diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..33b024a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +api/schema/*.json text eol=lf +internal/schema/schema/*.json text eol=lf diff --git a/README.md b/README.md index f4f37a9..fe63865 100644 --- a/README.md +++ b/README.md @@ -1,45 +1,70 @@ # Bering -Bering builds a resilience model artifact from trace artifacts. The output artifact is `bering-model.json` in `BeringResilienceModel` v1.0.0 format. Use it in any downstream tooling or analytics pipeline. +Bering is a discovery and publishing layer for service topology and endpoint contracts. -## Contract +It supports two operating modes: -Bering pins `metadata.schema` exactly: +- deterministic batch discovery from trace files and directories +- long-running runtime discovery that accepts OTLP/HTTP spans and publishes rolling snapshot envelopes for observability consumers + +Bering owns discovery and discovery-side public contracts. It does not own simulation, gating, chaos execution, or policy decisions. + +## Public Artifacts + +Bering currently publishes two versioned JSON artifact types. + +### Core model - `name`: `io.mb3r.bering.model` - `version`: `1.0.0` - `uri`: `https://mb3r-lab.github.io/Bering/schema/model/v1.0.0/model.schema.json` - `digest`: `sha256:272277c093f37580adcd2dded225bd37c86539d642d7910baad7e4228227d1a7` -Any mismatch fails validation. +This remains the simple stable topology artifact for file-based users and downstream tools such as Sheaft. + +### Snapshot envelope + +- `name`: `io.mb3r.bering.snapshot` +- `version`: `1.0.0` +- `uri`: `https://mb3r-lab.github.io/Bering/schema/snapshot/v1.0.0/snapshot.schema.json` +- `digest`: `sha256:87e4e887ed4a37b72f6136e268b73552eccb92941c4de2c6f3a514dd066ea972` + +This wraps the core model with runtime window metadata, ingest counts, support summaries, provenance, and topology diffs. + +`bering validate` accepts either artifact type. 
## Repository layout ```text cmd/bering CLI entrypoint internal/app command wiring -internal/connectors/traces trace file/dir loading and normalization -internal/discovery model inference from normalized spans -internal/model model structs, semantic checks, canonical IO +internal/config serve-mode config parsing and validation +internal/connectors/traces file/dir trace loading and normalization +internal/connectors/otlp OTLP/HTTP request decoding into normalized spans +internal/discovery source-agnostic discovery engine and overlay application +internal/model stable core model structs, semantic checks, canonical IO +internal/overlay generic discovery overlay loader +internal/runtime long-running service, tumbling windows, sinks, metrics internal/schema pinned contract constants + JSON Schema validation -internal/jsoncanon deterministic recursive JSON encoder -api/schema versioned public schema -configs sample configs (replicas override) -examples trace fixtures + expected output artifacts -docs format, heuristic, and MVP limits +internal/snapshot snapshot envelope structs, diffing, canonical IO +api/schema public schemas published via GitHub Pages +configs sample serve and overlay configs +examples traces, outputs, collector/prometheus/grafana examples +docs architecture, contract, config, migration, limits scripts/ci CI helper scripts ``` ## Commands ```bash -bering discover --input <file|dir> [--out bering-model.json] [--replicas replicas.yaml|json] [--discovered-at RFC3339] -bering validate --input <file> +bering discover --input <file|dir> [--out bering-model.json] [--snapshot-out bering-snapshot.json] [--replicas replicas.yaml|json] [--overlay overlay.yaml] [--discovered-at RFC3339] +bering validate --input <file> +bering serve --config configs/serve.sample.yaml [--listen :4318] [--window-size 30s] [--flush-interval 5s] ``` ## Quickstart -### 1) Discover from traces +### 1) Batch discovery from traces ```bash go run ./cmd/bering discover \ @@ -48,59 +73,119 @@ go run ./cmd/bering discover \ 
--discovered-at 2026-03-03T00:00:00Z ``` -### 2) Validate artifact +### 2) Validate the model artifact ```bash go run ./cmd/bering validate \ --input examples/outputs/bering-model.normalized.sample.json ``` -### 3) Use the model in any downstream tool +### 3) Generate a snapshot envelope in batch mode -Examples: +```bash +go run ./cmd/bering discover \ + --input examples/traces/normalized.sample.json \ + --out out/bering-model.json \ + --snapshot-out examples/outputs/bering-snapshot.normalized.sample.json \ + --overlay configs/discovery.overlay.sample.yaml \ + --discovered-at 2026-03-03T00:00:00Z +``` -- run your own analytics (topology checks, risk scoring, SLO diagnostics) -- feed the model into simulation tooling -- simulate failures with [Sheaft](https://github.com/MB3R-Lab/Sheaft) (one possible consumer), for example: - ```bash - # from sibling Sheaft repository - go run ./cmd/sheaft run \ - --model ../Bering/examples/outputs/bering-model.normalized.sample.json \ - --policy configs/gate.policy.example.yaml \ - --out-dir out \ - --seed 42 - ``` +### 4) Run the runtime service -## Deterministic output +```bash +go run ./cmd/bering serve --config configs/serve.sample.yaml +``` -Bering output is deterministic for identical inputs and flags: +The runtime service exposes: + +- `POST /v1/traces` for OTLP/HTTP trace ingest +- `GET /healthz` +- `GET /readyz` +- `GET /metrics` + +The primary integration path is standard OpenTelemetry Collector or SDK exporters sending spans to Bering over OTLP/HTTP. No custom Collector build is required. 
+ +### 5) Use the stable model with Sheaft + +```bash +# from a sibling Sheaft repository +go run ./cmd/sheaft run \ + --model ../Bering/examples/outputs/bering-model.normalized.sample.json \ + --policy configs/gate.policy.example.yaml \ + --out-dir out \ + --seed 42 +``` + +## Examples + +- Batch inputs: [examples/traces/normalized.sample.json](examples/traces/normalized.sample.json), [examples/traces/otel.sample.json](examples/traces/otel.sample.json) +- Batch outputs: [examples/outputs/bering-model.normalized.sample.json](examples/outputs/bering-model.normalized.sample.json), [examples/outputs/bering-snapshot.normalized.sample.json](examples/outputs/bering-snapshot.normalized.sample.json) +- Runtime config: [configs/serve.sample.yaml](configs/serve.sample.yaml) +- Discovery overlay: [configs/discovery.overlay.sample.yaml](configs/discovery.overlay.sample.yaml) +- Collector sidecar: [examples/collector/otelcol.sidecar.yaml](examples/collector/otelcol.sidecar.yaml) +- Prometheus scrape config: [examples/prometheus/bering.prometheus.yml](examples/prometheus/bering.prometheus.yml) +- Grafana dashboard: [examples/grafana/bering-runtime-dashboard.json](examples/grafana/bering-runtime-dashboard.json) + +## Determinism and Runtime Tradeoffs + +Batch output remains deterministic for identical inputs and flags: - services sorted by `id` - edges sorted by `(from,to,kind,blocking)` - endpoints sorted by `id` -- stable IDs for services/edges/endpoints -- canonical JSON writer with recursive object-key ordering (future-safe for map fields) +- canonical JSON output with stable object-key ordering - optional `--discovered-at` for reproducible timestamps -## Supported trace formats (MVP) +Runtime mode is intentionally bounded, not lossless: + +- one active tumbling window is retained in memory +- the previous emitted snapshot is retained for diffs and carry-forward runtime timestamps +- `runtime.max_in_memory_spans` bounds retained spans per active window +- late spans follow 
`drop` or `current_window` policy +- spans beyond the configured in-memory cap are dropped and surfaced via metrics/logs +- empty windows are advanced without emitting empty snapshots + +## Discovery overlays + +Discovery overlays are additive metadata inputs with explicit precedence by file order. They are intended for discovery-side enrichment, not policy evaluation. + +Supported examples include: + +- service labels and failure-eligibility labels +- endpoint predicate references +- workload or endpoint weights +- SLO references or tags +- replica overrides -- Normalized JSON: `{"spans": [...]}` payload with canonical span fields. -- Raw OTel JSON: `resourceSpans/scopeSpans/spans` payload. +See [configs/discovery.overlay.sample.yaml](configs/discovery.overlay.sample.yaml). -Details: [docs/trace-input-format.md](docs/trace-input-format.md) +## Metrics -## Schema publishing +The runtime service exports Prometheus/OpenMetrics metrics including: -Schema publishing is automated via GitHub Pages and release tags. +- `spans_ingested_total` +- `spans_dropped_total` +- `snapshots_emitted_total` +- `snapshot_build_duration_seconds` +- `current_services` +- `current_edges` +- `current_endpoints` +- `window_lag_seconds` +- `last_snapshot_unixtime` +- `snapshot_age_seconds` +- `diff_added_*` +- `diff_removed_*` +- `diff_changed_*` -- Workflow: `.github/workflows/publish-schema.yml` -- Trigger: tags matching `schema-v*` (for example `schema-v1.0.0`) -- Published paths: - - `https://mb3r-lab.github.io/Bering/schema/model/v1.0.0/model.schema.json` - - `https://mb3r-lab.github.io/Bering/schema/model/latest/model.schema.json` - - `https://mb3r-lab.github.io/Bering/schema/index.json` +## Additional docs -Operational steps are documented in [docs/schema-publishing.md](docs/schema-publishing.md). 
+- [docs/architecture.md](docs/architecture.md) +- [docs/runtime-config.md](docs/runtime-config.md) +- [docs/trace-input-format.md](docs/trace-input-format.md) +- [docs/schema-publishing.md](docs/schema-publishing.md) +- [docs/migration-notes.md](docs/migration-notes.md) +- [docs/mvp-scope-and-limits.md](docs/mvp-scope-and-limits.md) ## CI and local checks diff --git a/api/schema/snapshot.schema.json b/api/schema/snapshot.schema.json new file mode 100644 index 0000000..6f9cdf7 --- /dev/null +++ b/api/schema/snapshot.schema.json @@ -0,0 +1,330 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://mb3r-lab.github.io/Bering/schema/snapshot/v1.0.0/snapshot.schema.json", + "title": "BeringDiscoverySnapshot", + "type": "object", + "required": [ + "snapshot_id", + "topology_version", + "window_start", + "window_end", + "ingest", + "counts", + "coverage", + "sources", + "diff", + "discovery", + "model", + "metadata" + ], + "properties": { + "snapshot_id": { "type": "string" }, + "topology_version": { "type": "string" }, + "window_start": { "type": "string", "format": "date-time" }, + "window_end": { "type": "string", "format": "date-time" }, + "ingest": { + "type": "object", + "required": ["spans", "traces", "dropped_spans", "late_spans"], + "properties": { + "spans": { "type": "integer", "minimum": 0 }, + "traces": { "type": "integer", "minimum": 0 }, + "dropped_spans": { "type": "integer", "minimum": 0 }, + "late_spans": { "type": "integer", "minimum": 0 } + } + }, + "counts": { + "type": "object", + "required": ["services", "edges", "endpoints"], + "properties": { + "services": { "type": "integer", "minimum": 0 }, + "edges": { "type": "integer", "minimum": 0 }, + "endpoints": { "type": "integer", "minimum": 0 } + } + }, + "coverage": { + "type": "object", + "required": ["confidence", "service_support_min", "edge_support_min", "endpoint_support_min"], + "properties": { + "confidence": { "type": "number", "minimum": 0, "maximum": 1 }, + 
"service_support_min": { "type": "integer", "minimum": 0 }, + "edge_support_min": { "type": "integer", "minimum": 0 }, + "endpoint_support_min": { "type": "integer", "minimum": 0 } + } + }, + "sources": { + "type": "array", + "items": { + "type": "object", + "required": ["type"], + "properties": { + "type": { "type": "string" }, + "connector": { "type": "string" }, + "ref": { "type": "string" }, + "observations": { "type": "integer", "minimum": 0 } + } + } + }, + "diff": { + "type": "object", + "required": [ + "added_services", + "removed_services", + "changed_services", + "added_edges", + "removed_edges", + "changed_edges", + "added_endpoints", + "removed_endpoints", + "changed_endpoints" + ], + "properties": { + "added_services": { "type": "integer", "minimum": 0 }, + "removed_services": { "type": "integer", "minimum": 0 }, + "changed_services": { "type": "integer", "minimum": 0 }, + "added_edges": { "type": "integer", "minimum": 0 }, + "removed_edges": { "type": "integer", "minimum": 0 }, + "changed_edges": { "type": "integer", "minimum": 0 }, + "added_endpoints": { "type": "integer", "minimum": 0 }, + "removed_endpoints": { "type": "integer", "minimum": 0 }, + "changed_endpoints": { "type": "integer", "minimum": 0 } + } + }, + "discovery": { + "type": "object", + "required": ["services", "edges", "endpoints"], + "properties": { + "services": { + "type": "array", + "items": { "$ref": "#/$defs/serviceRecord" } + }, + "edges": { + "type": "array", + "items": { "$ref": "#/$defs/edgeRecord" } + }, + "endpoints": { + "type": "array", + "items": { "$ref": "#/$defs/endpointRecord" } + }, + "overlays": { + "type": "array", + "items": { + "type": "object", + "required": ["name", "precedence"], + "properties": { + "name": { "type": "string" }, + "ref": { "type": "string" }, + "precedence": { "type": "integer", "minimum": 0 } + } + } + } + } + }, + "model": { + "type": "object", + "required": ["services", "edges", "endpoints", "metadata"], + "properties": { + "services": { 
+ "type": "array", + "items": { + "type": "object", + "required": ["id", "name", "replicas"], + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "replicas": { "type": "integer", "minimum": 0 } + } + } + }, + "edges": { + "type": "array", + "items": { + "type": "object", + "required": ["from", "to", "kind", "blocking"], + "properties": { + "from": { "type": "string" }, + "to": { "type": "string" }, + "kind": { "type": "string", "enum": ["sync", "async"] }, + "blocking": { "type": "boolean" } + } + } + }, + "endpoints": { + "type": "array", + "items": { + "type": "object", + "required": ["id", "entry_service", "success_predicate_ref"], + "properties": { + "id": { "type": "string" }, + "entry_service": { "type": "string" }, + "success_predicate_ref": { "type": "string" } + } + } + }, + "metadata": { + "type": "object", + "required": ["source_type", "source_ref", "discovered_at", "confidence", "schema"], + "properties": { + "source_type": { "type": "string" }, + "source_ref": { "type": "string" }, + "discovered_at": { "type": "string", "format": "date-time" }, + "confidence": { "type": "number", "minimum": 0, "maximum": 1 }, + "schema": { "$ref": "#/$defs/schemaRef" } + } + } + } + }, + "metadata": { + "type": "object", + "required": ["source_type", "source_ref", "emitted_at", "confidence", "schema"], + "properties": { + "source_type": { "type": "string" }, + "source_ref": { "type": "string" }, + "emitted_at": { "type": "string", "format": "date-time" }, + "confidence": { "type": "number", "minimum": 0, "maximum": 1 }, + "schema": { "$ref": "#/$defs/schemaRef" } + } + } + }, + "$defs": { + "schemaRef": { + "type": "object", + "required": ["name", "version", "uri", "digest"], + "properties": { + "name": { "type": "string" }, + "version": { "type": "string" }, + "uri": { "type": "string" }, + "digest": { "type": "string" } + } + }, + "provenance": { + "type": "object", + "required": ["type"], + "properties": { + "type": { "type": "string" 
}, + "connector": { "type": "string" }, + "name": { "type": "string" }, + "ref": { "type": "string" }, + "precedence": { "type": "integer", "minimum": 0 } + } + }, + "support": { + "type": "object", + "required": ["observations", "trace_count"], + "properties": { + "observations": { "type": "integer", "minimum": 0 }, + "trace_count": { "type": "integer", "minimum": 0 }, + "evidence": { + "type": "array", + "items": { "type": "string" } + } + } + }, + "commonMetadata": { + "type": "object", + "properties": { + "labels": { + "type": "object", + "additionalProperties": { "type": "string" } + }, + "tags": { + "type": "array", + "items": { "type": "string" } + }, + "slo_refs": { + "type": "array", + "items": { "type": "string" } + }, + "attributes": { + "type": "object", + "additionalProperties": { "type": "string" } + } + } + }, + "serviceRecord": { + "type": "object", + "required": ["id", "name", "replicas", "support"], + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "replicas": { "type": "integer", "minimum": 0 }, + "support": { "$ref": "#/$defs/support" }, + "first_seen": { "type": "string", "format": "date-time" }, + "last_seen": { "type": "string", "format": "date-time" }, + "provenance": { + "type": "array", + "items": { "$ref": "#/$defs/provenance" } + }, + "metadata": { + "allOf": [ + { "$ref": "#/$defs/commonMetadata" }, + { + "type": "object", + "properties": { + "failure_eligible": { "type": "boolean" }, + "replicas_override": { "type": "integer", "minimum": 0 } + } + } + ] + } + } + }, + "edgeRecord": { + "type": "object", + "required": ["id", "from", "to", "kind", "blocking", "support"], + "properties": { + "id": { "type": "string" }, + "from": { "type": "string" }, + "to": { "type": "string" }, + "kind": { "type": "string", "enum": ["sync", "async"] }, + "blocking": { "type": "boolean" }, + "support": { "$ref": "#/$defs/support" }, + "first_seen": { "type": "string", "format": "date-time" }, + "last_seen": { "type": 
"string", "format": "date-time" }, + "provenance": { + "type": "array", + "items": { "$ref": "#/$defs/provenance" } + }, + "metadata": { + "allOf": [ + { "$ref": "#/$defs/commonMetadata" }, + { + "type": "object", + "properties": { + "weight": { "type": "number" } + } + } + ] + } + } + }, + "endpointRecord": { + "type": "object", + "required": ["id", "entry_service", "support"], + "properties": { + "id": { "type": "string" }, + "entry_service": { "type": "string" }, + "method": { "type": "string" }, + "path": { "type": "string" }, + "support": { "$ref": "#/$defs/support" }, + "first_seen": { "type": "string", "format": "date-time" }, + "last_seen": { "type": "string", "format": "date-time" }, + "provenance": { + "type": "array", + "items": { "$ref": "#/$defs/provenance" } + }, + "metadata": { + "allOf": [ + { "$ref": "#/$defs/commonMetadata" }, + { + "type": "object", + "properties": { + "weight": { "type": "number" }, + "predicate_ref": { "type": "string" } + } + } + ] + } + } + } + } +} diff --git a/build/Dockerfile b/build/Dockerfile index 5a546eb..8eb04b2 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.23 AS builder +FROM golang:1.24 AS builder WORKDIR /src COPY go.mod go.sum* ./ @@ -10,4 +10,3 @@ RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/bering ./cmd/bering FROM gcr.io/distroless/static-debian12 COPY --from=builder /out/bering /usr/local/bin/bering ENTRYPOINT ["/usr/local/bin/bering"] - diff --git a/configs/discovery.overlay.sample.yaml b/configs/discovery.overlay.sample.yaml new file mode 100644 index 0000000..2b09ce8 --- /dev/null +++ b/configs/discovery.overlay.sample.yaml @@ -0,0 +1,32 @@ +name: sample-discovery-overlay +services: + - id: checkout + replicas: 3 + failure_eligible: true + labels: + team: commerce + tier: backend + slo_refs: + - slo:checkout-latency + - id: payment + labels: + team: payments + tier: backend +edges: + - from: frontend + to: checkout + kind: sync + blocking: true + weight: 
0.8 + labels: + flow: checkout +endpoints: + - id: checkout:POST /process + predicate_ref: catalog.checkout.success + weight: 0.6 + labels: + workflow: purchase + - id: frontend:GET /checkout + weight: 0.4 + labels: + workflow: browse diff --git a/configs/serve.sample.yaml b/configs/serve.sample.yaml new file mode 100644 index 0000000..d7dea00 --- /dev/null +++ b/configs/serve.sample.yaml @@ -0,0 +1,15 @@ +server: + listen_address: ":4318" + max_request_bytes: 5242880 +runtime: + flush_interval: 5s + window_size: 30s + max_in_memory_spans: 10000 + late_span_policy: drop +sink: + directory: out/snapshots + latest_path: out/latest-snapshot.json +logging: + structured: true +overlays: + - configs/discovery.overlay.sample.yaml diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..6752b1b --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,106 @@ +# Architecture + +## Overview + +Bering now has two user-facing flows built on one normalized discovery core. + +### Batch flow + +1. `bering discover` loads trace JSON files or directories. +2. File inputs are normalized into the internal `traces.Span` shape. +3. The discovery engine infers services, edges, endpoints, confidence, and overlay-driven metadata. +4. Bering writes the stable `io.mb3r.bering.model` artifact. +5. Optional `--snapshot-out` also writes a `io.mb3r.bering.snapshot` envelope. + +### Runtime flow + +1. `bering serve` accepts OTLP/HTTP `POST /v1/traces` requests. +2. OTLP requests are normalized into the same internal `traces.Span` shape used by batch mode. +3. Spans are accumulated in a bounded active tumbling window. +4. On schedule, the window closes and Bering runs discovery. +5. Bering computes a stable topology digest, diffs against the previous snapshot, and writes the snapshot to sinks. +6. Metrics and health endpoints expose runtime operability. 
+ +## Internal interfaces and seams + +The runtime additions are intentionally interface-driven so future connectors can slot in without rewriting the discovery engine. + +- source adapters: normalize external data into internal spans or overlays +- overlay loaders: decode reusable discovery-side metadata files +- snapshot sinks: write snapshot envelopes to one or more destinations +- clocks: make windowing deterministic in tests +- storage boundary: one active in-memory tumbling window with an explicit cap + +## Normalized span model + +`internal/connectors/traces.Span` remains the small shared runtime shape. + +Fields used today: + +- trace identity: `trace_id`, `span_id`, `parent_span_id` +- service identity: `service` +- operation naming: `name`, `kind` +- timestamps: `start_time`, `end_time` +- discovery-relevant attributes: HTTP and messaging attributes + +This keeps discovery logic source-agnostic. File JSON, OTLP/HTTP, and future adapters all converge on the same shape. + +## Contracts + +### Stable model contract + +`io.mb3r.bering.model` v1.0.0 remains the simple downstream-facing topology contract. + +It is still the default output for file-based users and remains deterministic for identical inputs. + +### Snapshot contract + +`io.mb3r.bering.snapshot` v1.0.0 is additive. It wraps the stable model with: + +- `snapshot_id` +- `topology_version` +- window boundaries +- ingest counts +- support summaries +- provenance +- overlay application metadata +- diff summary + +This lets runtime consumers observe topology drift without forcing existing model readers to migrate immediately. + +## Overlay precedence + +Overlay precedence is explicit and deterministic. + +1. inferred runtime/batch discovery +2. legacy replica override file when used +3. overlay files in the order they are provided +4. command flags override config fields, not overlay content + +Overlays enrich discovery artifacts. They do not introduce gating, pass/fail semantics, or execution logic. 
+ +## Async/sync edge classification + +Edge kind remains heuristic and extensible. + +Current signals: + +- span kind `producer` or `consumer` +- messaging attributes such as `messaging.system`, `messaging.destination`, `messaging.operation` +- otherwise default to `sync` + +The snapshot contract records edge support evidence so downstream consumers can reason about heuristic classification without Bering pretending it observed a hard truth. + +## Memory bounds and late spans + +Runtime mode intentionally trades completeness for bounded memory. + +- only the active tumbling window is retained +- only the previous emitted snapshot is retained for diffs and carry-forward `first_seen` / `last_seen` +- `runtime.max_in_memory_spans` bounds retained spans in that window +- excess spans are dropped and surfaced in metrics/logs +- late spans follow one of two policies: + - `drop` + - `current_window` + +This keeps Bering operational as a discovery/publishing layer rather than a stateful analysis engine. diff --git a/docs/migration-notes.md b/docs/migration-notes.md new file mode 100644 index 0000000..f05d5d2 --- /dev/null +++ b/docs/migration-notes.md @@ -0,0 +1,39 @@ +# Migration Notes + +## Batch users + +Existing file-based workflows continue to work. + +- `bering discover` still writes the same `io.mb3r.bering.model` v1.0.0 artifact by default +- `bering validate` still validates the stable model contract +- deterministic output for identical inputs and flags is preserved + +If you do nothing, your existing batch model flow should remain unchanged. + +## New optional capabilities + +You can now opt into additional discovery-side features. + +- `--overlay` for generic discovery metadata overlays +- `--snapshot-out` for a snapshot envelope in batch mode +- `bering serve` for OTLP/HTTP runtime discovery + +## Sheaft and other downstream model consumers + +The stable model contract is unchanged. 
+ +Downstream model readers do not need to understand the snapshot envelope unless they want runtime drift, support, or provenance details. + +Recommended migration path for downstream consumers: + +1. keep consuming `io.mb3r.bering.model` v1.0.0 unchanged +2. add optional snapshot-envelope parsing only when runtime observability features are needed +3. treat overlay-derived metadata as advisory discovery context, not execution policy + +## Success predicate references + +Bering does not infer brittle success predicates from traces alone. + +The stable model still carries `success_predicate_ref` for compatibility. Without an overlay, this remains a stable self-reference rather than a claim that Bering learned a complete success rule from traces. + +If you have an external endpoint predicate catalog, provide it through overlays. diff --git a/docs/mvp-scope-and-limits.md b/docs/mvp-scope-and-limits.md index a6cbcd2..a955391 100644 --- a/docs/mvp-scope-and-limits.md +++ b/docs/mvp-scope-and-limits.md @@ -1,22 +1,25 @@ -# MVP Scope and Limits +# Scope and Limits ## In scope -- Discover model from trace files/directories. -- Emit contract-compliant `bering-model.json` (v1.0.0) compatible with Sheaft and other downstream analytics tools. -- Validate artifact with strict `metadata.schema` match. -- Deterministic output ordering and canonical JSON serialization. +- deterministic batch discovery from trace files/directories +- long-running runtime service that accepts OTLP/HTTP spans +- stable core model artifacts (`io.mb3r.bering.model`) +- snapshot envelopes for observability/runtime consumers (`io.mb3r.bering.snapshot`) +- generic discovery overlays for metadata and predicate references +- Prometheus/OpenMetrics runtime metrics and health endpoints -## Out of scope (MVP) +## Explicitly out of scope -- Non-trace connectors (runtime service registries, topology APIs). -- Advanced dependency semantics (timeouts, retries, circuit breakers). 
-- Correlated failures and probabilistic calibration from production data. -- Multi-version schema negotiation. +- simulation execution +- gating or approval policy decisions +- chaos orchestration +- benchmark-specific rules or benchmark-specific naming +- claiming trace-only success semantics that were not supplied externally -## Confidence heuristic (MVP) +## Confidence heuristic -`metadata.confidence` is deterministic and computed from discovery coverage: +`metadata.confidence` remains deterministic and informational. - base score: `0.35` - `+0.20` if more than one service discovered @@ -24,7 +27,18 @@ - `+0.15` if at least one endpoint inferred - `+min(0.15, cross_service_parent_child_ratio * 0.15)` -Result is clamped to `[0,1]` and rounded to 2 decimals. +Result is clamped to `[0,1]` and rounded to two decimals. -This score is informational in MVP and should not be interpreted as a calibrated -probability of correctness. +This score is not a calibrated probability and must not be treated as a gate. + +## Runtime memory and lateness limits + +Runtime mode intentionally keeps memory bounded. + +- one active tumbling window is retained +- retained spans are capped by config +- buffer overflow drops spans and reports them in metrics/logs +- late spans follow `drop` or `current_window` policy +- empty windows are advanced without emitting empty artifacts + +These are operational tradeoffs, not bugs. They keep Bering in the role of a discovery/publishing layer. diff --git a/docs/runtime-config.md b/docs/runtime-config.md new file mode 100644 index 0000000..69eac58 --- /dev/null +++ b/docs/runtime-config.md @@ -0,0 +1,98 @@ +# Runtime Config + +`bering serve` reads YAML or JSON config. Flags may override config values. + +Sample file: [configs/serve.sample.yaml](../configs/serve.sample.yaml) + +## Fields + +### `server.listen_address` + +TCP listen address for the HTTP server. 
+ +Example: `":4318"` + +### `server.max_request_bytes` + +Maximum OTLP request body size in bytes after decompression. + +Example: `5242880` + +### `runtime.flush_interval` + +How often Bering checks whether the active window should close. + +Example: `5s` + +Tradeoff: smaller values reduce `window_lag_seconds` but wake the service more often. + +### `runtime.window_size` + +Tumbling window size for discovery snapshots. + +Example: `30s` + +Tradeoff: smaller windows react faster to topology drift; larger windows produce steadier topology snapshots and reduce churn. + +### `runtime.max_in_memory_spans` + +Maximum number of spans retained in the active window before new spans are dropped. + +Example: `10000` + +Tradeoff: higher values improve coverage but increase peak memory. + +### `runtime.late_span_policy` + +How Bering handles spans whose event time falls before the active window start. + +Allowed values: + +- `drop` +- `current_window` + +`drop` keeps window boundaries strict. `current_window` preserves more evidence at the cost of blurring the boundaries between scheduled windows. + +### `sink.directory` + +Directory sink for one file per emitted snapshot. + +Example: `out/snapshots` + +### `sink.latest_path` + +Optional stable path that is rewritten on every successful snapshot. + +Example: `out/latest-snapshot.json` + +### `logging.structured` + +Emit JSON logs instead of text logs. + +Example: `true` + +### `overlays` + +Ordered list of overlay files to apply after inferred discovery. 
+ +Example: + +```yaml +overlays: + - configs/discovery.overlay.sample.yaml +``` + +## Flag overrides + +`bering serve` supports these override flags: + +- `--config` +- `--listen` +- `--flush-interval` +- `--window-size` +- `--max-in-memory-spans` +- `--late-span-policy` +- `--sink-dir` +- `--latest-path` +- `--log-format text|json` +- `--overlay` (repeatable) diff --git a/docs/schema-publishing.md b/docs/schema-publishing.md index 37dd20d..1ac17a6 100644 --- a/docs/schema-publishing.md +++ b/docs/schema-publishing.md @@ -1,6 +1,6 @@ # Schema Publishing (GitHub Pages) -This repository publishes the public model schema through GitHub Pages. +This repository publishes the public model and snapshot schemas through GitHub Pages. ## One-time repository setup @@ -16,28 +16,24 @@ After first deployment, GitHub creates the `github-pages` environment automatica - Trigger: `push` tags matching `schema-v*` - Optional emergency path: `workflow_dispatch` -The workflow: +The workflow should publish both schema families: -1. Reads `ExpectedSchemaVersion`, `ExpectedSchemaURI`, and `ExpectedSchemaDigest` from `internal/schema/constants.go`. -2. Validates tag/version binding (`schema-vX.Y.Z` must match `ExpectedSchemaVersion`). -3. Validates schema JSON and `$id` binding. -4. Builds a Pages artifact with: - - `schema/model/v<version>/model.schema.json` - - `schema/model/latest/model.schema.json` - - `schema/index.json` -5. Deploys to GitHub Pages. +- `schema/model/v<version>/model.schema.json` +- `schema/model/latest/model.schema.json` +- `schema/snapshot/v<version>/snapshot.schema.json` +- `schema/snapshot/latest/snapshot.schema.json` +- `schema/index.json` ## Release operation model 1. Merge schema changes into `main`. -2. Create and push tag `schema-v<version>` (for example, `schema-v1.0.0`). +2. Create and push a tag `schema-v<version>` when a public schema changes. 3. Wait for workflow completion. -4. Verify: - - schema URL returns `200` - - downloaded schema digest matches `ExpectedSchemaDigest` +4. 
Verify each published URL returns `200`. +5. Verify the downloaded digest matches the pinned digest in `internal/schema/constants.go`. ## Notes -- This stage updates Bering only. -- Sheaft currently pins strict URI and digest independently and must be migrated in a separate coordinated change. - +- The stable model and the snapshot envelope are versioned independently by schema name. +- Downstream consumers such as Sheaft can remain pinned to the model schema while runtime consumers adopt the snapshot schema. +- Never silently mutate a published schema version in place. diff --git a/docs/trace-input-format.md b/docs/trace-input-format.md index 09dc5af..cfda435 100644 --- a/docs/trace-input-format.md +++ b/docs/trace-input-format.md @@ -1,6 +1,6 @@ -# Trace Input Format (MVP) +# Trace Input Format -Bering MVP supports two JSON input formats. +Bering supports two batch JSON formats and one runtime network ingest path. ## 1) Normalized spans JSON @@ -16,6 +16,8 @@ Top-level shape: "service": "frontend", "kind": "server", "name": "GET /checkout", + "start_time": "2026-03-11T12:00:00Z", + "end_time": "2026-03-11T12:00:00.050Z", "attributes": { "http.request.method": "GET", "http.route": "/checkout" @@ -31,8 +33,12 @@ Accepted aliases: - `spanId` for `span_id` - `parentSpanId` for `parent_span_id` - `service_name` or `service.name` for `service` +- `startTime` for `start_time` +- `endTime` for `end_time` -## 2) Raw OTel JSON payload +Timestamp values may be RFC3339 strings. 
+ +## 2) Raw OTLP JSON payload Expected hierarchy: @@ -41,27 +47,44 @@ Expected hierarchy: - `resourceSpans[].scopeSpans[]` (or `instrumentationLibrarySpans[]`) - `...scopeSpans[].spans[]` -Supported OTel fields: +Supported OTLP fields: - `traceId`, `spanId`, `parentSpanId`, `name`, `kind` -- resource/span attributes in key/value form (`key` + `value.stringValue|intValue|doubleValue|boolValue`) +- `startTimeUnixNano`, `endTimeUnixNano` +- resource/span attributes in key/value form `service.name` is resolved from resource attributes first, then span attributes. +## 3) Runtime OTLP/HTTP ingest + +`bering serve` accepts OTLP/HTTP at `POST /v1/traces`. + +Supported request encodings: + +- protobuf (`application/x-protobuf`) +- JSON (`application/json`) +- optional `Content-Encoding: gzip` + +This is the primary integration path for any standard OpenTelemetry Collector or SDK exporter. + ## Discovery-relevant attributes -- HTTP endpoint inference: - - `http.request.method`, `http.method` - - `http.route`, `url.path`, `http.target` -- Async edge heuristic: - - `messaging.system` - - `messaging.destination` - - `messaging.operation` - - span kind `producer` or `consumer` +### HTTP endpoint inference + +- `http.request.method`, `http.method` +- `http.route`, `url.path`, `http.target` +- span name fallback such as `GET /checkout` + +### Async edge heuristic + +- span kind `producer` or `consumer` +- `messaging.system` +- `messaging.destination` +- `messaging.operation` ## Input mode -- `--input` can point to: - - a single JSON file - - a directory (all `*.json` files recursively, sorted by path) +`--input` can point to: +- a single JSON file +- a directory (all `*.json` files recursively, sorted by path) diff --git a/examples/collector/otelcol.sidecar.yaml b/examples/collector/otelcol.sidecar.yaml new file mode 100644 index 0000000..1e46ff0 --- /dev/null +++ b/examples/collector/otelcol.sidecar.yaml @@ -0,0 +1,21 @@ +receivers: + otlp: + protocols: + grpc: {} + http: {} + 
+processors: + batch: {} + +exporters: + otlphttp/bering: + endpoint: http://bering:4318 + tls: + insecure: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/bering] diff --git a/examples/grafana/bering-runtime-dashboard.json b/examples/grafana/bering-runtime-dashboard.json new file mode 100644 index 0000000..4f0020b --- /dev/null +++ b/examples/grafana/bering-runtime-dashboard.json @@ -0,0 +1,70 @@ +{ + "annotations": {"list": []}, + "editable": true, + "panels": [ + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": {"defaults": {"unit": "ops"}, "overrides": []}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "id": 1, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "single"}}, + "targets": [ + {"expr": "rate(spans_ingested_total[5m])", "legendFormat": "ingested/s", "refId": "A"}, + {"expr": "rate(spans_dropped_total[5m])", "legendFormat": "dropped/s", "refId": "B"} + ], + "title": "Span Throughput", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "id": 2, + "options": {"legend": {"displayMode": "table", "placement": "bottom"}, "tooltip": {"mode": "single"}}, + "targets": [ + {"expr": "current_services", "legendFormat": "services", "refId": "A"}, + {"expr": "current_edges", "legendFormat": "edges", "refId": "B"}, + {"expr": "current_endpoints", "legendFormat": "endpoints", "refId": "C"} + ], + "title": "Current Topology Size", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "id": 3, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "single"}}, + "targets": [ + 
{"expr": "snapshot_age_seconds", "legendFormat": "snapshot age", "refId": "A"}, + {"expr": "window_lag_seconds", "legendFormat": "window lag", "refId": "B"} + ], + "title": "Runtime Lag", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "id": 4, + "options": {"legend": {"displayMode": "table", "placement": "bottom"}, "tooltip": {"mode": "single"}}, + "targets": [ + {"expr": "diff_added_services", "legendFormat": "added services", "refId": "A"}, + {"expr": "diff_removed_services", "legendFormat": "removed services", "refId": "B"}, + {"expr": "diff_added_edges + diff_removed_edges", "legendFormat": "edge drift", "refId": "C"}, + {"expr": "diff_added_endpoints + diff_removed_endpoints", "legendFormat": "endpoint drift", "refId": "D"} + ], + "title": "Topology Drift", + "type": "timeseries" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": ["bering", "topology", "runtime"], + "templating": {"list": []}, + "time": {"from": "now-6h", "to": "now"}, + "timezone": "browser", + "title": "Bering Runtime Health", + "version": 1 +} diff --git a/examples/outputs/bering-snapshot.normalized.sample.json b/examples/outputs/bering-snapshot.normalized.sample.json new file mode 100644 index 0000000..c9b7a0d --- /dev/null +++ b/examples/outputs/bering-snapshot.normalized.sample.json @@ -0,0 +1,394 @@ +{ + "counts": { + "edges": 3, + "endpoints": 3, + "services": 4 + }, + "coverage": { + "confidence": 0.94, + "edge_support_min": 1, + "endpoint_support_min": 1, + "service_support_min": 1 + }, + "diff": { + "added_edges": 3, + "added_endpoints": 3, + "added_services": 4, + "changed_edges": 0, + "changed_endpoints": 0, + "changed_services": 0, + "removed_edges": 0, + "removed_endpoints": 0, + "removed_services": 0 + }, + "discovery": { + "edges": [ + { + "blocking": true, + "from": "checkout", + 
"id": "checkout|inventory|sync|true", + "kind": "sync", + "metadata": {}, + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + } + ], + "support": { + "evidence": [ + "default_sync" + ], + "observations": 1, + "trace_count": 1 + }, + "to": "inventory" + }, + { + "blocking": false, + "from": "checkout", + "id": "checkout|payment|async|false", + "kind": "async", + "metadata": {}, + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + } + ], + "support": { + "evidence": [ + "child_async_signal", + "messaging.destination", + "messaging.system", + "span.kind=consumer" + ], + "observations": 1, + "trace_count": 1 + }, + "to": "payment" + }, + { + "blocking": true, + "from": "frontend", + "id": "frontend|checkout|sync|true", + "kind": "sync", + "metadata": { + "labels": { + "flow": "checkout" + }, + "weight": 0.8 + }, + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + }, + { + "name": "sample-discovery-overlay", + "precedence": 1, + "ref": "configs\\discovery.overlay.sample.yaml", + "type": "overlay" + } + ], + "support": { + "evidence": [ + "default_sync" + ], + "observations": 1, + "trace_count": 1 + }, + "to": "checkout" + } + ], + "endpoints": [ + { + "entry_service": "checkout", + "id": "checkout:POST /process", + "metadata": { + "labels": { + "workflow": "purchase" + }, + "predicate_ref": "catalog.checkout.success", + "weight": 0.6 + }, + "method": "POST", + "path": "/process", + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + }, + { + "name": "sample-discovery-overlay", + "precedence": 1, + "ref": "configs\\discovery.overlay.sample.yaml", + "type": "overlay" + } + ], + 
"support": { + "observations": 1, + "trace_count": 1 + } + }, + { + "entry_service": "frontend", + "id": "frontend:GET /checkout", + "metadata": { + "labels": { + "workflow": "browse" + }, + "weight": 0.4 + }, + "method": "GET", + "path": "/checkout", + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + }, + { + "name": "sample-discovery-overlay", + "precedence": 1, + "ref": "configs\\discovery.overlay.sample.yaml", + "type": "overlay" + } + ], + "support": { + "observations": 1, + "trace_count": 1 + } + }, + { + "entry_service": "frontend", + "id": "frontend:GET /health", + "metadata": {}, + "method": "GET", + "path": "/health", + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + } + ], + "support": { + "observations": 1, + "trace_count": 1 + } + } + ], + "overlays": [ + { + "name": "sample-discovery-overlay", + "precedence": 1, + "ref": "configs\\discovery.overlay.sample.yaml" + } + ], + "services": [ + { + "id": "checkout", + "metadata": { + "failure_eligible": true, + "labels": { + "team": "commerce", + "tier": "backend" + }, + "replicas_override": 3, + "slo_refs": [ + "slo:checkout-latency" + ] + }, + "name": "checkout", + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + }, + { + "name": "sample-discovery-overlay", + "precedence": 1, + "ref": "configs\\discovery.overlay.sample.yaml", + "type": "overlay" + } + ], + "replicas": 3, + "support": { + "observations": 1, + "trace_count": 1 + } + }, + { + "id": "frontend", + "metadata": {}, + "name": "frontend", + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + } + ], + "replicas": 1, + "support": { + "observations": 2, + 
"trace_count": 2 + } + }, + { + "id": "inventory", + "metadata": {}, + "name": "inventory", + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + } + ], + "replicas": 1, + "support": { + "observations": 1, + "trace_count": 1 + } + }, + { + "id": "payment", + "metadata": { + "labels": { + "team": "payments", + "tier": "backend" + } + }, + "name": "payment", + "provenance": [ + { + "connector": "trace_file", + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + }, + { + "name": "sample-discovery-overlay", + "precedence": 1, + "ref": "configs\\discovery.overlay.sample.yaml", + "type": "overlay" + } + ], + "replicas": 1, + "support": { + "observations": 1, + "trace_count": 1 + } + } + ] + }, + "ingest": { + "dropped_spans": 0, + "late_spans": 0, + "spans": 5, + "traces": 2 + }, + "metadata": { + "confidence": 0.94, + "emitted_at": "2026-03-03T00:00:00Z", + "schema": { + "digest": "sha256:87e4e887ed4a37b72f6136e268b73552eccb92941c4de2c6f3a514dd066ea972", + "name": "io.mb3r.bering.snapshot", + "uri": "https://mb3r-lab.github.io/Bering/schema/snapshot/v1.0.0/snapshot.schema.json", + "version": "1.0.0" + }, + "source_ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "source_type": "bering" + }, + "model": { + "edges": [ + { + "blocking": true, + "from": "checkout", + "kind": "sync", + "to": "inventory" + }, + { + "blocking": false, + "from": "checkout", + "kind": "async", + "to": "payment" + }, + { + "blocking": true, + "from": "frontend", + "kind": "sync", + "to": "checkout" + } + ], + "endpoints": [ + { + "entry_service": "checkout", + "id": "checkout:POST /process", + "success_predicate_ref": "catalog.checkout.success" + }, + { + "entry_service": "frontend", + "id": "frontend:GET /checkout", + "success_predicate_ref": "frontend:GET /checkout" + }, + { + "entry_service": "frontend", + "id": 
"frontend:GET /health", + "success_predicate_ref": "frontend:GET /health" + } + ], + "metadata": { + "confidence": 0.94, + "discovered_at": "2026-03-03T00:00:00Z", + "schema": { + "digest": "sha256:272277c093f37580adcd2dded225bd37c86539d642d7910baad7e4228227d1a7", + "name": "io.mb3r.bering.model", + "uri": "https://mb3r-lab.github.io/Bering/schema/model/v1.0.0/model.schema.json", + "version": "1.0.0" + }, + "source_ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "source_type": "bering" + }, + "services": [ + { + "id": "checkout", + "name": "checkout", + "replicas": 3 + }, + { + "id": "frontend", + "name": "frontend", + "replicas": 1 + }, + { + "id": "inventory", + "name": "inventory", + "replicas": 1 + }, + { + "id": "payment", + "name": "payment", + "replicas": 1 + } + ] + }, + "snapshot_id": "snap-87b9684332d99fa3564dea4f", + "sources": [ + { + "connector": "trace_file", + "observations": 5, + "ref": "bering://discover?input=examples%2Ftraces%2Fnormalized.sample.json", + "type": "traces" + } + ], + "topology_version": "sha256:b20ebd677050d035b78b26a7d915a5d459a187dc7e2f3288d9e4ec32e24ae610", + "window_end": "2026-03-03T00:00:00Z", + "window_start": "2026-03-03T00:00:00Z" +} diff --git a/examples/prometheus/bering.prometheus.yml b/examples/prometheus/bering.prometheus.yml new file mode 100644 index 0000000..61b52b4 --- /dev/null +++ b/examples/prometheus/bering.prometheus.yml @@ -0,0 +1,10 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: bering + metrics_path: /metrics + static_configs: + - targets: + - bering:4318 diff --git a/go.mod b/go.mod index c7811db..5e04321 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,29 @@ module github.com/MB3R-Lab/Bering -go 1.23 +go 1.24.0 require ( + github.com/prometheus/client_golang v1.23.2 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 + go.opentelemetry.io/proto/otlp v1.9.0 + google.golang.org/protobuf v1.36.11 gopkg.in/yaml.v3 v3.0.1 ) + +require ( + 
github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect + github.com/kr/text v0.2.0 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + golang.org/x/net v0.50.0 // indirect + golang.org/x/sys v0.41.0 // indirect + golang.org/x/text v0.34.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect + google.golang.org/grpc v1.75.1 // indirect +) diff --git a/go.sum b/go.sum index 70d82da..592b998 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,84 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod 
h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model 
v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= 
+go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= +go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= +golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= +golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0= +google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= +google.golang.org/grpc v1.75.1 h1:/ODCNEuf9VghjgO3rqLcfg8fiOP0nSluljWFlDxELLI= +google.golang.org/grpc v1.75.1/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= 
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/app/app.go b/internal/app/app.go index b8897fc..394687f 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -1,18 +1,26 @@ package app import ( + "context" "flag" "fmt" "io" + "log/slog" "os" + "os/signal" "path/filepath" "strings" + "syscall" "time" + "github.com/MB3R-Lab/Bering/internal/config" "github.com/MB3R-Lab/Bering/internal/connectors/traces" "github.com/MB3R-Lab/Bering/internal/discovery" "github.com/MB3R-Lab/Bering/internal/model" + "github.com/MB3R-Lab/Bering/internal/overlay" + beringruntime "github.com/MB3R-Lab/Bering/internal/runtime" "github.com/MB3R-Lab/Bering/internal/schema" + "github.com/MB3R-Lab/Bering/internal/snapshot" ) const ( @@ -45,6 +53,8 @@ func (r Runner) Run(args []string) int { return r.runDiscover(args[1:]) case "validate": return r.runValidate(args[1:]) + case "serve": + return r.runServe(args[1:]) case "help", "--help", "-h": r.printUsage() return ExitOK @@ -61,8 +71,11 @@ func (r Runner) runDiscover(args []string) int { input := fs.String("input", "", "Path to trace input file or directory") out := fs.String("out", "bering-model.json", "Path to output model JSON") + snapshotOut := fs.String("snapshot-out", "", "Optional path to output snapshot envelope JSON") replicas := fs.String("replicas", "", "Path to replicas override file (yaml or json)") discoveredAt := 
fs.String("discovered-at", "", "RFC3339 timestamp override for metadata.discovered_at") + var overlays stringSliceFlag + fs.Var(&overlays, "overlay", "Path to a discovery overlay file (yaml or json); may be repeated") if err := fs.Parse(args); err != nil { r.printfErr("discover flag parse error: %v\n", err) @@ -88,6 +101,12 @@ func (r Runner) runDiscover(args []string) int { return ExitError } + loadedOverlays, err := overlay.LoadFiles(overlays.Values()) + if err != nil { + r.printfErr("load overlays: %v\n", err) + return ExitError + } + override := map[string]int{} if strings.TrimSpace(*replicas) != "" { override, err = traces.LoadReplicasOverride(*replicas) @@ -97,27 +116,27 @@ func (r Runner) runDiscover(args []string) int { } } - mdl, err := discovery.Build(spans, discovery.Options{ - SourceRef: discovery.BuildSourceRef(*input), + sourceRef := discovery.BuildSourceRef(*input) + result, err := discovery.Discover(spans, discovery.Options{ + SourceRef: sourceRef, DiscoveredAt: discoveredAtValue, ReplicasOverride: override, + Overlays: loadedOverlays, }) if err != nil { r.printfErr("discover model: %v\n", err) return ExitError } - raw, err := model.MarshalCanonical(mdl) + raw, err := model.MarshalCanonical(result.Model) if err != nil { r.printfErr("serialize model: %v\n", err) return ExitError } - if err := schema.ValidateJSON(raw); err != nil { r.printfErr("post-discovery model validation failed: %v\n", err) return ExitError } - if err := os.MkdirAll(filepath.Dir(*out), 0o755); err != nil { r.printfErr("create output directory: %v\n", err) return ExitError @@ -127,8 +146,34 @@ func (r Runner) runDiscover(args []string) int { return ExitError } + if strings.TrimSpace(*snapshotOut) != "" { + env, err := buildBatchSnapshot(result, discoveredAtValue) + if err != nil { + r.printfErr("build snapshot output: %v\n", err) + return ExitError + } + rawSnapshot, err := snapshot.MarshalCanonical(env) + if err != nil { + r.printfErr("serialize snapshot output: %v\n", err) + 
return ExitError + } + if err := schema.ValidateSnapshotJSON(rawSnapshot); err != nil { + r.printfErr("post-discovery snapshot validation failed: %v\n", err) + return ExitError + } + if err := os.MkdirAll(filepath.Dir(*snapshotOut), 0o755); err != nil { + r.printfErr("create snapshot output directory: %v\n", err) + return ExitError + } + if err := os.WriteFile(*snapshotOut, rawSnapshot, 0o644); err != nil { + r.printfErr("write snapshot output: %v\n", err) + return ExitError + } + r.printf("snapshot written: %s\n", *snapshotOut) + } + r.printf("model written: %s\n", *out) - r.printf("services=%d edges=%d endpoints=%d confidence=%.2f\n", len(mdl.Services), len(mdl.Edges), len(mdl.Endpoints), mdl.Metadata.Confidence) + r.printf("services=%d edges=%d endpoints=%d confidence=%.2f\n", len(result.Model.Services), len(result.Model.Edges), len(result.Model.Endpoints), result.Model.Metadata.Confidence) return ExitOK } @@ -136,7 +181,7 @@ func (r Runner) runValidate(args []string) int { fs := flag.NewFlagSet("validate", flag.ContinueOnError) fs.SetOutput(io.Discard) - input := fs.String("input", "", "Path to Bering model JSON") + input := fs.String("input", "", "Path to Bering model or snapshot JSON") if err := fs.Parse(args); err != nil { r.printfErr("validate flag parse error: %v\n", err) @@ -149,20 +194,123 @@ func (r Runner) runValidate(args []string) int { raw, err := os.ReadFile(*input) if err != nil { - r.printfErr("read model file: %v\n", err) + r.printfErr("read artifact file: %v\n", err) return ExitError } - if err := schema.ValidateJSON(raw); err != nil { + if err := schema.ValidateArtifactJSON(raw); err != nil { r.printfErr("contract validation failed: %v\n", err) return ExitError } - if _, err := model.ParseJSON(raw); err != nil { - r.printfErr("semantic validation failed: %v\n", err) + ref, err := schema.ExtractSchemaRef(raw) + if err != nil { + r.printfErr("extract schema ref: %v\n", err) + return ExitError + } + switch ref.Name { + case 
schema.ExpectedSchemaName: + if _, err := model.ParseJSON(raw); err != nil { + r.printfErr("semantic validation failed: %v\n", err) + return ExitError + } + case schema.ExpectedSnapshotSchemaName: + if _, err := snapshot.ParseJSON(raw); err != nil { + r.printfErr("semantic validation failed: %v\n", err) + return ExitError + } + default: + r.printfErr("unsupported artifact schema: %s\n", ref.Name) return ExitError } - r.printf("model is valid: %s\n", *input) + r.printf("artifact is valid: %s\n", *input) + return ExitOK +} + +func (r Runner) runServe(args []string) int { + fs := flag.NewFlagSet("serve", flag.ContinueOnError) + fs.SetOutput(io.Discard) + + configPath := fs.String("config", "", "Path to serve config YAML/JSON") + listen := fs.String("listen", "", "Override config server listen address") + flushInterval := fs.String("flush-interval", "", "Override config runtime flush interval (e.g. 5s)") + windowSize := fs.String("window-size", "", "Override config runtime window size (e.g. 
30s)") + maxInMemory := fs.Int("max-in-memory-spans", -1, "Override config runtime max in-memory spans") + latePolicy := fs.String("late-span-policy", "", "Override config runtime late span policy (drop|current_window)") + sinkDir := fs.String("sink-dir", "", "Override config sink directory") + latestPath := fs.String("latest-path", "", "Override config stable latest snapshot path") + logFormat := fs.String("log-format", "", "Override log format (text|json)") + var overlays stringSliceFlag + fs.Var(&overlays, "overlay", "Override overlay file list; may be repeated") + + if err := fs.Parse(args); err != nil { + r.printfErr("serve flag parse error: %v\n", err) + return ExitError + } + + cfg, err := config.LoadServeConfig(*configPath) + if err != nil { + r.printfErr("load config: %v\n", err) + return ExitError + } + if strings.TrimSpace(*listen) != "" { + cfg.Server.ListenAddress = strings.TrimSpace(*listen) + } + if strings.TrimSpace(*flushInterval) != "" { + parsed, err := time.ParseDuration(strings.TrimSpace(*flushInterval)) + if err != nil { + r.printfErr("parse --flush-interval: %v\n", err) + return ExitError + } + cfg.Runtime.FlushInterval = config.Duration(parsed) + } + if strings.TrimSpace(*windowSize) != "" { + parsed, err := time.ParseDuration(strings.TrimSpace(*windowSize)) + if err != nil { + r.printfErr("parse --window-size: %v\n", err) + return ExitError + } + cfg.Runtime.WindowSize = config.Duration(parsed) + } + if *maxInMemory > 0 { + cfg.Runtime.MaxInMemorySpans = *maxInMemory + } + if strings.TrimSpace(*latePolicy) != "" { + cfg.Runtime.LateSpanPolicy = strings.TrimSpace(*latePolicy) + } + if strings.TrimSpace(*sinkDir) != "" { + cfg.Sink.Directory = strings.TrimSpace(*sinkDir) + } + if strings.TrimSpace(*latestPath) != "" { + cfg.Sink.LatestPath = strings.TrimSpace(*latestPath) + } + if len(overlays.items) > 0 { + cfg.Overlays = overlays.Values() + } + if err := cfg.Validate(); err != nil { + r.printfErr("invalid config: %v\n", err) + return 
ExitError + } + + loadedOverlays, err := overlay.LoadFiles(cfg.Overlays) + if err != nil { + r.printfErr("load overlays: %v\n", err) + return ExitError + } + + logger := newLogger(r.stderr, cfg.Logging.Structured, strings.TrimSpace(*logFormat)) + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + service, err := beringruntime.NewService(cfg, loadedOverlays, logger) + if err != nil { + r.printfErr("create service: %v\n", err) + return ExitError + } + if err := service.Run(ctx); err != nil { + r.printfErr("serve failed: %v\n", err) + return ExitError + } return ExitOK } @@ -170,8 +318,9 @@ func (r Runner) printUsage() { fmt.Fprintln(r.stdout, "Bering CLI") fmt.Fprintln(r.stdout) fmt.Fprintln(r.stdout, "Usage:") - fmt.Fprintln(r.stdout, " bering discover --input [--out bering-model.json] [--replicas replicas.yaml|json] [--discovered-at RFC3339]") - fmt.Fprintln(r.stdout, " bering validate --input ") + fmt.Fprintln(r.stdout, " bering discover --input [--out bering-model.json] [--snapshot-out bering-snapshot.json] [--replicas replicas.yaml|json] [--overlay overlay.yaml] [--discovered-at RFC3339]") + fmt.Fprintln(r.stdout, " bering validate --input ") + fmt.Fprintln(r.stdout, " bering serve --config configs/serve.sample.yaml [--listen :8080] [--window-size 30s] [--flush-interval 5s]") } func (r Runner) printf(format string, args ...any) { @@ -181,3 +330,79 @@ func (r Runner) printf(format string, args ...any) { func (r Runner) printfErr(format string, args ...any) { fmt.Fprintf(r.stderr, format, args...) 
} + +type stringSliceFlag struct { + items []string +} + +func (f *stringSliceFlag) String() string { + return strings.Join(f.items, ",") +} + +func (f *stringSliceFlag) Set(value string) error { + trimmed := strings.TrimSpace(value) + if trimmed == "" { + return fmt.Errorf("value cannot be empty") + } + f.items = append(f.items, trimmed) + return nil +} + +func (f *stringSliceFlag) Values() []string { + return append([]string(nil), f.items...) +} + +func newLogger(w io.Writer, configStructured bool, formatOverride string) *slog.Logger { + structured := configStructured + switch strings.ToLower(strings.TrimSpace(formatOverride)) { + case "json": + structured = true + case "text": + structured = false + } + if structured { + return slog.New(slog.NewJSONHandler(w, &slog.HandlerOptions{Level: slog.LevelInfo})) + } + return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{Level: slog.LevelInfo})) +} + +func buildBatchSnapshot(result discovery.Result, discoveredAt string) (snapshot.Envelope, error) { + topologyVersion, err := snapshot.TopologyDigest(result.Model) + if err != nil { + return snapshot.Envelope{}, err + } + env := snapshot.Envelope{ + SnapshotID: snapshot.BuildSnapshotID(discoveredAt, discoveredAt, topologyVersion), + TopologyVersion: topologyVersion, + WindowStart: discoveredAt, + WindowEnd: discoveredAt, + Ingest: snapshot.IngestSummary{ + Spans: result.SpanCount, + Traces: result.TraceCount, + }, + Counts: snapshot.Counts{ + Services: len(result.Model.Services), + Edges: len(result.Model.Edges), + Endpoints: len(result.Model.Endpoints), + }, + Coverage: result.Coverage, + Sources: result.Sources, + Discovery: result.Discovery, + Model: result.Model, + Metadata: snapshot.Metadata{ + SourceType: discovery.SourceTypeBering, + SourceRef: result.Model.Metadata.SourceRef, + EmittedAt: discoveredAt, + Confidence: result.Coverage.Confidence, + Schema: model.SchemaRef{ + Name: schema.ExpectedSnapshotSchemaName, + Version: schema.ExpectedSnapshotSchemaVersion, 
+ URI: schema.ExpectedSnapshotSchemaURI, + Digest: schema.ExpectedSnapshotSchemaDigest, + }, + }, + } + env.Diff = snapshot.ComputeDiff(nil, env) + env.SortDeterministic() + return env, nil +} diff --git a/internal/app/app_test.go b/internal/app/app_test.go index 647e8c8..86f926c 100644 --- a/internal/app/app_test.go +++ b/internal/app/app_test.go @@ -108,6 +108,59 @@ func TestDiscoverAndValidate_OTLPFixture(t *testing.T) { } } +func TestDiscoverSnapshotWithOverlay(t *testing.T) { + t.Parallel() + + root := repoRoot(t) + prevWD, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + if err := os.Chdir(root); err != nil { + t.Fatalf("chdir root: %v", err) + } + defer func() { + _ = os.Chdir(prevWD) + }() + + input := filepath.Join("examples", "traces", "normalized.sample.json") + out := filepath.Join(t.TempDir(), "bering-model.json") + snapshotOut := filepath.Join(t.TempDir(), "bering-snapshot.json") + overlayPath := filepath.Join(t.TempDir(), "overlay.yaml") + overlayRaw := []byte(`name: test-overlay +services: + - id: checkout + replicas: 3 +endpoints: + - id: checkout:POST /process + predicate_ref: catalog.checkout.success +`) + if err := os.WriteFile(overlayPath, overlayRaw, 0o644); err != nil { + t.Fatalf("write overlay fixture: %v", err) + } + + var stdout, stderr bytes.Buffer + runner := NewRunner(&stdout, &stderr) + exitCode := runner.Run([]string{ + "discover", + "--input", input, + "--out", out, + "--snapshot-out", snapshotOut, + "--overlay", overlayPath, + "--discovered-at", "2026-03-03T00:00:00Z", + }) + if exitCode != ExitOK { + t.Fatalf("discover failed (exit=%d): stderr=%s", exitCode, stderr.String()) + } + + stdout.Reset() + stderr.Reset() + exitCode = runner.Run([]string{"validate", "--input", snapshotOut}) + if exitCode != ExitOK { + t.Fatalf("validate snapshot failed (exit=%d): stderr=%s", exitCode, stderr.String()) + } +} + func repoRoot(t *testing.T) string { t.Helper() _, thisFile, _, ok := runtime.Caller(0) diff --git 
a/internal/config/serve.go b/internal/config/serve.go new file mode 100644 index 0000000..52a88a2 --- /dev/null +++ b/internal/config/serve.go @@ -0,0 +1,156 @@ +package config + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "gopkg.in/yaml.v3" +) + +type Duration time.Duration + +func (d *Duration) UnmarshalJSON(data []byte) error { + var asString string + if err := json.Unmarshal(data, &asString); err == nil { + parsed, err := time.ParseDuration(asString) + if err != nil { + return fmt.Errorf("parse duration %q: %w", asString, err) + } + *d = Duration(parsed) + return nil + } + var asInt int64 + if err := json.Unmarshal(data, &asInt); err == nil { + *d = Duration(time.Duration(asInt)) + return nil + } + return fmt.Errorf("duration must be a string or integer nanoseconds") +} + +func (d *Duration) UnmarshalYAML(value *yaml.Node) error { + if value == nil { + return nil + } + if value.Kind == yaml.ScalarNode { + var asString string + if err := value.Decode(&asString); err == nil { + parsed, err := time.ParseDuration(asString) + if err != nil { + return fmt.Errorf("parse duration %q: %w", asString, err) + } + *d = Duration(parsed) + return nil + } + var asInt int64 + if err := value.Decode(&asInt); err == nil { + *d = Duration(time.Duration(asInt)) + return nil + } + } + return fmt.Errorf("duration must be a scalar string or integer") +} + +func (d Duration) Duration() time.Duration { + return time.Duration(d) +} + +type ServeConfig struct { + Server ServerConfig `json:"server" yaml:"server"` + Runtime RuntimeConfig `json:"runtime" yaml:"runtime"` + Sink SinkConfig `json:"sink" yaml:"sink"` + Logging LoggingConfig `json:"logging" yaml:"logging"` + Overlays []string `json:"overlays" yaml:"overlays"` +} + +type ServerConfig struct { + ListenAddress string `json:"listen_address" yaml:"listen_address"` + MaxRequestBytes int64 `json:"max_request_bytes" yaml:"max_request_bytes"` +} + +type RuntimeConfig struct { + FlushInterval 
Duration `json:"flush_interval" yaml:"flush_interval"` + WindowSize Duration `json:"window_size" yaml:"window_size"` + MaxInMemorySpans int `json:"max_in_memory_spans" yaml:"max_in_memory_spans"` + LateSpanPolicy string `json:"late_span_policy" yaml:"late_span_policy"` +} + +type SinkConfig struct { + Directory string `json:"directory" yaml:"directory"` + LatestPath string `json:"latest_path" yaml:"latest_path"` +} + +type LoggingConfig struct { + Structured bool `json:"structured" yaml:"structured"` +} + +func DefaultServeConfig() ServeConfig { + return ServeConfig{ + Server: ServerConfig{ + ListenAddress: ":8080", + MaxRequestBytes: 5 << 20, + }, + Runtime: RuntimeConfig{ + FlushInterval: Duration(5 * time.Second), + WindowSize: Duration(30 * time.Second), + MaxInMemorySpans: 10000, + LateSpanPolicy: "drop", + }, + Sink: SinkConfig{ + Directory: "out/snapshots", + }, + } +} + +func LoadServeConfig(path string) (ServeConfig, error) { + cfg := DefaultServeConfig() + trimmed := strings.TrimSpace(path) + if trimmed == "" { + return cfg, cfg.Validate() + } + raw, err := os.ReadFile(trimmed) + if err != nil { + return ServeConfig{}, fmt.Errorf("read config file: %w", err) + } + switch strings.ToLower(filepath.Ext(trimmed)) { + case ".json": + if err := json.Unmarshal(raw, &cfg); err != nil { + return ServeConfig{}, fmt.Errorf("decode config json: %w", err) + } + default: + if err := yaml.Unmarshal(raw, &cfg); err != nil { + return ServeConfig{}, fmt.Errorf("decode config yaml: %w", err) + } + } + return cfg, cfg.Validate() +} + +func (c ServeConfig) Validate() error { + if strings.TrimSpace(c.Server.ListenAddress) == "" { + return fmt.Errorf("server.listen_address cannot be empty") + } + if c.Server.MaxRequestBytes <= 0 { + return fmt.Errorf("server.max_request_bytes must be > 0") + } + if c.Runtime.FlushInterval.Duration() <= 0 { + return fmt.Errorf("runtime.flush_interval must be > 0") + } + if c.Runtime.WindowSize.Duration() <= 0 { + return 
fmt.Errorf("runtime.window_size must be > 0") + } + if c.Runtime.MaxInMemorySpans <= 0 { + return fmt.Errorf("runtime.max_in_memory_spans must be > 0") + } + switch strings.ToLower(strings.TrimSpace(c.Runtime.LateSpanPolicy)) { + case "drop", "current_window": + default: + return fmt.Errorf("runtime.late_span_policy must be one of: drop, current_window") + } + if strings.TrimSpace(c.Sink.Directory) == "" { + return fmt.Errorf("sink.directory cannot be empty") + } + return nil +} diff --git a/internal/config/serve_test.go b/internal/config/serve_test.go new file mode 100644 index 0000000..ba2c9ea --- /dev/null +++ b/internal/config/serve_test.go @@ -0,0 +1,67 @@ +package config + +import ( + "os" + "path/filepath" + "testing" + "time" +) + +func TestLoadServeConfigYAML(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + path := filepath.Join(dir, "serve.yaml") + raw := []byte(`server: + listen_address: "127.0.0.1:4318" + max_request_bytes: 1048576 +runtime: + flush_interval: 2s + window_size: 15s + max_in_memory_spans: 500 + late_span_policy: current_window +sink: + directory: out/runtime + latest_path: out/runtime/latest.json +logging: + structured: true +overlays: + - configs/discovery.overlay.yaml +`) + if err := os.WriteFile(path, raw, 0o644); err != nil { + t.Fatalf("write config fixture: %v", err) + } + + cfg, err := LoadServeConfig(path) + if err != nil { + t.Fatalf("LoadServeConfig returned error: %v", err) + } + if cfg.Server.ListenAddress != "127.0.0.1:4318" { + t.Fatalf("listen address mismatch: %s", cfg.Server.ListenAddress) + } + if got, want := cfg.Runtime.FlushInterval.Duration(), 2*time.Second; got != want { + t.Fatalf("flush interval mismatch: got=%s want=%s", got, want) + } + if got, want := cfg.Runtime.WindowSize.Duration(), 15*time.Second; got != want { + t.Fatalf("window size mismatch: got=%s want=%s", got, want) + } + if got, want := cfg.Runtime.LateSpanPolicy, "current_window"; got != want { + t.Fatalf("late span policy mismatch: got=%s 
want=%s", got, want) + } + if !cfg.Logging.Structured { + t.Fatal("expected structured logging to be enabled") + } + if got, want := len(cfg.Overlays), 1; got != want { + t.Fatalf("overlay count mismatch: got=%d want=%d", got, want) + } +} + +func TestServeConfigValidateRejectsInvalidLatePolicy(t *testing.T) { + t.Parallel() + + cfg := DefaultServeConfig() + cfg.Runtime.LateSpanPolicy = "unknown" + if err := cfg.Validate(); err == nil { + t.Fatal("expected validation error for invalid late policy") + } +} diff --git a/internal/connectors/otlp/http.go b/internal/connectors/otlp/http.go new file mode 100644 index 0000000..269cf62 --- /dev/null +++ b/internal/connectors/otlp/http.go @@ -0,0 +1,172 @@ +package otlp + +import ( + "compress/gzip" + "fmt" + "io" + "net/http" + "strings" + "time" + + collecttracev1 "go.opentelemetry.io/proto/otlp/collector/trace/v1" + commonv1 "go.opentelemetry.io/proto/otlp/common/v1" + resourcev1 "go.opentelemetry.io/proto/otlp/resource/v1" + tracev1 "go.opentelemetry.io/proto/otlp/trace/v1" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" + + "github.com/MB3R-Lab/Bering/internal/connectors/traces" +) + +const ConnectorName = "otlp_http" + +func DecodeHTTPRequest(r *http.Request, maxBytes int64) ([]traces.Span, error) { + body, err := readBody(r, maxBytes) + if err != nil { + return nil, err + } + return DecodePayload(r.Header.Get("Content-Type"), body) +} + +func DecodePayload(contentType string, body []byte) ([]traces.Span, error) { + var req collecttracev1.ExportTraceServiceRequest + if isJSONContentType(contentType) { + if err := protojson.Unmarshal(body, &req); err != nil { + return nil, fmt.Errorf("decode otlp json: %w", err) + } + } else { + if err := proto.Unmarshal(body, &req); err != nil { + return nil, fmt.Errorf("decode otlp protobuf: %w", err) + } + } + return normalizeRequest(&req), nil +} + +func normalizeRequest(req *collecttracev1.ExportTraceServiceRequest) []traces.Span { + out := 
[]traces.Span{} + for _, resourceSpans := range req.GetResourceSpans() { + resourceAttrs := resourceAttributes(resourceSpans.GetResource()) + for _, scopeSpans := range resourceSpans.GetScopeSpans() { + for _, span := range scopeSpans.GetSpans() { + attrs := attributeMap(span.GetAttributes()) + merged := map[string]any{} + for key, value := range resourceAttrs { + merged[key] = value + } + for key, value := range attrs { + merged[key] = value + } + service, _ := merged["service.name"].(string) + out = append(out, traces.Span{ + TraceID: traceIDHex(span.GetTraceId()), + SpanID: traceIDHex(span.GetSpanId()), + ParentSpanID: traceIDHex(span.GetParentSpanId()), + Service: strings.TrimSpace(service), + Name: strings.TrimSpace(span.GetName()), + Kind: mapSpanKind(span.GetKind()), + StartTime: time.Unix(0, int64(span.GetStartTimeUnixNano())).UTC(), + EndTime: time.Unix(0, int64(span.GetEndTimeUnixNano())).UTC(), + Attributes: merged, + }) + } + } + } + return out +} + +func resourceAttributes(resource *resourcev1.Resource) map[string]any { + if resource == nil { + return map[string]any{} + } + return attributeMap(resource.GetAttributes()) +} + +func attributeMap(attrs []*commonv1.KeyValue) map[string]any { + out := make(map[string]any, len(attrs)) + for _, item := range attrs { + out[item.GetKey()] = anyValue(item.GetValue()) + } + return out +} + +func anyValue(value *commonv1.AnyValue) any { + if value == nil { + return nil + } + switch typed := value.Value.(type) { + case *commonv1.AnyValue_StringValue: + return typed.StringValue + case *commonv1.AnyValue_BoolValue: + return typed.BoolValue + case *commonv1.AnyValue_IntValue: + return typed.IntValue + case *commonv1.AnyValue_DoubleValue: + return typed.DoubleValue + case *commonv1.AnyValue_ArrayValue: + out := make([]any, 0, len(typed.ArrayValue.GetValues())) + for _, item := range typed.ArrayValue.GetValues() { + out = append(out, anyValue(item)) + } + return out + case *commonv1.AnyValue_KvlistValue: + out := 
map[string]any{} + for _, item := range typed.KvlistValue.GetValues() { + out[item.GetKey()] = anyValue(item.GetValue()) + } + return out + case *commonv1.AnyValue_BytesValue: + return typed.BytesValue + default: + return nil + } +} + +func readBody(r *http.Request, maxBytes int64) ([]byte, error) { + var reader io.Reader = r.Body + if strings.EqualFold(strings.TrimSpace(r.Header.Get("Content-Encoding")), "gzip") { + gz, err := gzip.NewReader(r.Body) + if err != nil { + return nil, fmt.Errorf("open gzip body: %w", err) + } + defer gz.Close() + reader = gz + } + if maxBytes > 0 { + reader = io.LimitReader(reader, maxBytes+1) + } + body, err := io.ReadAll(reader) + if err != nil { + return nil, fmt.Errorf("read request body: %w", err) + } + if maxBytes > 0 && int64(len(body)) > maxBytes { + return nil, fmt.Errorf("request body exceeds %d bytes", maxBytes) + } + return body, nil +} + +func isJSONContentType(contentType string) bool { + contentType = strings.ToLower(strings.TrimSpace(contentType)) + return strings.Contains(contentType, "json") +} + +func traceIDHex(raw []byte) string { + if len(raw) == 0 { + return "" + } + return fmt.Sprintf("%x", raw) +} + +func mapSpanKind(kind tracev1.Span_SpanKind) string { + switch kind { + case tracev1.Span_SPAN_KIND_SERVER: + return "server" + case tracev1.Span_SPAN_KIND_CLIENT: + return "client" + case tracev1.Span_SPAN_KIND_PRODUCER: + return "producer" + case tracev1.Span_SPAN_KIND_CONSUMER: + return "consumer" + default: + return "internal" + } +} diff --git a/internal/connectors/otlp/http_test.go b/internal/connectors/otlp/http_test.go new file mode 100644 index 0000000..0b81a89 --- /dev/null +++ b/internal/connectors/otlp/http_test.go @@ -0,0 +1,108 @@ +package otlp + +import ( + "bytes" + "compress/gzip" + "net/http/httptest" + "testing" + "time" + + collecttracev1 "go.opentelemetry.io/proto/otlp/collector/trace/v1" + commonv1 "go.opentelemetry.io/proto/otlp/common/v1" + resourcev1 
"go.opentelemetry.io/proto/otlp/resource/v1" + tracev1 "go.opentelemetry.io/proto/otlp/trace/v1" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" +) + +func TestDecodePayload_Protobuf(t *testing.T) { + t.Parallel() + + req := sampleRequest(time.Date(2026, 3, 11, 12, 0, 0, 0, time.UTC)) + raw, err := proto.Marshal(req) + if err != nil { + t.Fatalf("marshal protobuf request: %v", err) + } + + spans, err := DecodePayload("application/x-protobuf", raw) + if err != nil { + t.Fatalf("DecodePayload returned error: %v", err) + } + if got, want := len(spans), 1; got != want { + t.Fatalf("span count mismatch: got=%d want=%d", got, want) + } + if spans[0].Service != "frontend" { + t.Fatalf("service mismatch: %s", spans[0].Service) + } + if spans[0].Kind != "server" { + t.Fatalf("kind mismatch: %s", spans[0].Kind) + } + if spans[0].StartTime.IsZero() || spans[0].EndTime.IsZero() { + t.Fatal("expected timestamps to be populated") + } +} + +func TestDecodeHTTPRequest_JSONGzip(t *testing.T) { + t.Parallel() + + req := sampleRequest(time.Date(2026, 3, 11, 12, 0, 0, 0, time.UTC)) + raw, err := protojson.Marshal(req) + if err != nil { + t.Fatalf("marshal json request: %v", err) + } + var buf bytes.Buffer + gz := gzipWriter(t, &buf) + if _, err := gz.Write(raw); err != nil { + t.Fatalf("gzip write: %v", err) + } + if err := gz.Close(); err != nil { + t.Fatalf("gzip close: %v", err) + } + + httpReq := httptest.NewRequest("POST", "/v1/traces", &buf) + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Content-Encoding", "gzip") + + spans, err := DecodeHTTPRequest(httpReq, 1<<20) + if err != nil { + t.Fatalf("DecodeHTTPRequest returned error: %v", err) + } + if got, want := len(spans), 1; got != want { + t.Fatalf("span count mismatch: got=%d want=%d", got, want) + } + if spans[0].Attributes["http.route"] != "/checkout" { + t.Fatalf("route attribute mismatch: %+v", spans[0].Attributes) + } +} + +func sampleRequest(start 
time.Time) *collecttracev1.ExportTraceServiceRequest { + end := start.Add(50 * time.Millisecond) + return &collecttracev1.ExportTraceServiceRequest{ + ResourceSpans: []*tracev1.ResourceSpans{{ + Resource: &resourcev1.Resource{Attributes: []*commonv1.KeyValue{{ + Key: "service.name", + Value: &commonv1.AnyValue{Value: &commonv1.AnyValue_StringValue{StringValue: "frontend"}}, + }}}, + ScopeSpans: []*tracev1.ScopeSpans{{ + Spans: []*tracev1.Span{{ + TraceId: []byte{0xaa, 0xbb}, + SpanId: []byte{0x01}, + Name: "GET /checkout", + Kind: tracev1.Span_SPAN_KIND_SERVER, + StartTimeUnixNano: uint64(start.UnixNano()), + EndTimeUnixNano: uint64(end.UnixNano()), + Attributes: []*commonv1.KeyValue{ + {Key: "http.request.method", Value: &commonv1.AnyValue{Value: &commonv1.AnyValue_StringValue{StringValue: "GET"}}}, + {Key: "http.route", Value: &commonv1.AnyValue{Value: &commonv1.AnyValue_StringValue{StringValue: "/checkout"}}}, + }, + }}, + }}, + }}, + } +} + +func gzipWriter(t *testing.T, buf *bytes.Buffer) *gzip.Writer { + t.Helper() + gz := gzip.NewWriter(buf) + return gz +} diff --git a/internal/connectors/traces/loader.go b/internal/connectors/traces/loader.go index ff306ee..868f0c7 100644 --- a/internal/connectors/traces/loader.go +++ b/internal/connectors/traces/loader.go @@ -10,6 +10,7 @@ import ( "sort" "strconv" "strings" + "time" "gopkg.in/yaml.v3" ) @@ -21,9 +22,21 @@ type Span struct { Service string Name string Kind string + StartTime time.Time + EndTime time.Time Attributes map[string]any } +func (s Span) EventTime(fallback time.Time) time.Time { + if !s.EndTime.IsZero() { + return s.EndTime.UTC() + } + if !s.StartTime.IsZero() { + return s.StartTime.UTC() + } + return fallback.UTC() +} + func Load(inputPath string) ([]Span, error) { paths, err := collectInputFiles(inputPath) if err != nil { @@ -150,6 +163,8 @@ func parseNormalized(doc any) ([]Span, bool) { Service: firstNonEmptyString(obj["service"], obj["service_name"], obj["service.name"]), Name: 
firstNonEmptyString(obj["name"]), Kind: strings.ToLower(strings.TrimSpace(firstNonEmptyString(obj["kind"]))), + StartTime: parseTimeValue(obj["start_time"], obj["startTime"]), + EndTime: parseTimeValue(obj["end_time"], obj["endTime"]), Attributes: attrs, }) } @@ -223,6 +238,8 @@ func parseOTLP(doc any) ([]Span, bool) { Service: service, Name: firstNonEmptyString(spanObj["name"]), Kind: kind, + StartTime: parseTimeValue(spanObj["startTimeUnixNano"], spanObj["start_time_unix_nano"]), + EndTime: parseTimeValue(spanObj["endTimeUnixNano"], spanObj["end_time_unix_nano"]), Attributes: mergedAttrs, }) } @@ -393,3 +410,31 @@ func firstNonEmptyString(values ...any) string { } return "" } + +func parseTimeValue(values ...any) time.Time { + for _, value := range values { + switch typed := value.(type) { + case string: + if strings.TrimSpace(typed) == "" { + continue + } + if parsed, err := time.Parse(time.RFC3339Nano, strings.TrimSpace(typed)); err == nil { + return parsed.UTC() + } + if n, err := strconv.ParseInt(strings.TrimSpace(typed), 10, 64); err == nil { + return time.Unix(0, n).UTC() + } + case json.Number: + if n, err := typed.Int64(); err == nil { + return time.Unix(0, n).UTC() + } + case float64: + return time.Unix(0, int64(typed)).UTC() + case int64: + return time.Unix(0, typed).UTC() + case int: + return time.Unix(0, int64(typed)).UTC() + } + } + return time.Time{} +} diff --git a/internal/discovery/discovery.go b/internal/discovery/discovery.go index 25140b4..e8add12 100644 --- a/internal/discovery/discovery.go +++ b/internal/discovery/discovery.go @@ -11,7 +11,9 @@ import ( "github.com/MB3R-Lab/Bering/internal/connectors/traces" "github.com/MB3R-Lab/Bering/internal/model" + "github.com/MB3R-Lab/Bering/internal/overlay" "github.com/MB3R-Lab/Bering/internal/schema" + "github.com/MB3R-Lab/Bering/internal/snapshot" ) const SourceTypeBering = "bering" @@ -20,11 +22,31 @@ type Options struct { SourceRef string DiscoveredAt string ReplicasOverride map[string]int + 
Overlays []overlay.File + Sources []snapshot.SourceSummary + RuntimeMode bool +} + +type Result struct { + Model model.ResilienceModel + Discovery snapshot.DiscoveryDetails + Sources []snapshot.SourceSummary + Coverage snapshot.CoverageSummary + SpanCount int + TraceCount int } func Build(spans []traces.Span, opts Options) (model.ResilienceModel, error) { + result, err := Discover(spans, opts) + if err != nil { + return model.ResilienceModel{}, err + } + return result.Model, nil +} + +func Discover(spans []traces.Span, opts Options) (Result, error) { if len(spans) == 0 { - return model.ResilienceModel{}, fmt.Errorf("no spans provided for discovery") + return Result{}, fmt.Errorf("no spans provided for discovery") } discoveredAt := opts.DiscoveredAt @@ -32,7 +54,7 @@ func Build(spans []traces.Span, opts Options) (model.ResilienceModel, error) { discoveredAt = time.Now().UTC().Format(time.RFC3339) } if _, err := time.Parse(time.RFC3339, discoveredAt); err != nil { - return model.ResilienceModel{}, fmt.Errorf("discovered_at must be RFC3339: %w", err) + return Result{}, fmt.Errorf("discovered_at must be RFC3339: %w", err) } sourceRef := strings.TrimSpace(opts.SourceRef) @@ -40,8 +62,17 @@ func Build(spans []traces.Span, opts Options) (model.ResilienceModel, error) { sourceRef = "bering://discover" } + allOverlays := make([]overlay.File, 0, len(opts.Overlays)+1) + if len(opts.ReplicasOverride) > 0 { + allOverlays = append(allOverlays, overlay.FromReplicas(opts.ReplicasOverride, "legacy:replicas")) + } + allOverlays = append(allOverlays, opts.Overlays...) 
+ spanByKey := map[string]traces.Span{} serviceSet := map[string]int{} + serviceStats := map[string]*serviceAccumulator{} + traceSet := map[string]struct{}{} + for _, span := range spans { service := strings.TrimSpace(span.Service) if service == "" { @@ -53,21 +84,22 @@ func Build(spans []traces.Span, opts Options) (model.ResilienceModel, error) { if span.TraceID != "" && span.SpanID != "" { spanByKey[traceSpanKey(span.TraceID, span.SpanID)] = span } + if span.TraceID != "" { + traceSet[span.TraceID] = struct{}{} + } + serviceStats[service] = updateServiceAccumulator(serviceStats[service], span, opts.RuntimeMode) } if len(serviceSet) == 0 { - return model.ResilienceModel{}, fmt.Errorf("discovery produced no services") + return Result{}, fmt.Errorf("discovery produced no services") } - for serviceID, replicas := range opts.ReplicasOverride { - if _, exists := serviceSet[serviceID]; !exists { - return model.ResilienceModel{}, fmt.Errorf("replicas override references unknown service %q", serviceID) - } - serviceSet[serviceID] = replicas + if err := applyReplicaOverlays(serviceSet, allOverlays); err != nil { + return Result{}, err } - edgeSet := map[string]model.Edge{} - endpointSet := map[string]model.Endpoint{} + edgeStats := map[string]*edgeAccumulator{} + endpointStats := map[string]*endpointAccumulator{} linkedCrossServiceEdges := 0 for _, span := range spans { @@ -77,38 +109,51 @@ func Build(spans []traces.Span, opts Options) (model.ResilienceModel, error) { parent, hasParent := parentSpan(span, spanByKey) if hasParent && strings.TrimSpace(parent.Service) != "" && parent.Service != span.Service { - kind := edgeKind(parent, span) + kind, evidence := edgeKind(parent, span) blocking := kind == model.EdgeKindSync key := edgeKey(parent.Service, span.Service, kind, blocking) - if _, exists := edgeSet[key]; !exists { - edgeSet[key] = model.Edge{ - From: parent.Service, - To: span.Service, - Kind: kind, - Blocking: blocking, - } - } + edgeStats[key] = 
updateEdgeAccumulator(edgeStats[key], key, parent, span, kind, blocking, evidence, opts.RuntimeMode) linkedCrossServiceEdges++ } if endpoint, ok := inferEndpoint(span, parent, hasParent); ok { - endpointSet[endpoint.ID] = endpoint + endpointStats[endpoint.ID] = updateEndpointAccumulator(endpointStats[endpoint.ID], endpoint, span, opts.RuntimeMode) } } services := make([]model.Service, 0, len(serviceSet)) + serviceRecords := make([]snapshot.ServiceRecord, 0, len(serviceStats)) for id, replicas := range serviceSet { - services = append(services, model.Service{ - ID: id, - Name: id, - Replicas: replicas, + services = append(services, model.Service{ID: id, Name: id, Replicas: replicas}) + stat := serviceStats[id] + serviceRecords = append(serviceRecords, snapshot.ServiceRecord{ + ID: id, + Name: id, + Replicas: replicas, + Support: stat.supportSummary(), + FirstSeen: formatOptionalTime(stat.firstSeen, opts.RuntimeMode), + LastSeen: formatOptionalTime(stat.lastSeen, opts.RuntimeMode), + Provenance: []snapshot.Provenance{{Type: "traces", Connector: inferConnector(opts.Sources), Ref: sourceRef}}, }) } sort.Slice(services, func(i, j int) bool { return services[i].ID < services[j].ID }) - - edges := make([]model.Edge, 0, len(edgeSet)) - for _, edge := range edgeSet { - edges = append(edges, edge) + sort.Slice(serviceRecords, func(i, j int) bool { return serviceRecords[i].ID < serviceRecords[j].ID }) + + edges := make([]model.Edge, 0, len(edgeStats)) + edgeRecords := make([]snapshot.EdgeRecord, 0, len(edgeStats)) + for _, stat := range edgeStats { + edges = append(edges, stat.edge) + edgeRecords = append(edgeRecords, snapshot.EdgeRecord{ + ID: stat.id, + From: stat.edge.From, + To: stat.edge.To, + Kind: stat.edge.Kind, + Blocking: stat.edge.Blocking, + Support: stat.supportSummary(), + FirstSeen: formatOptionalTime(stat.firstSeen, opts.RuntimeMode), + LastSeen: formatOptionalTime(stat.lastSeen, opts.RuntimeMode), + Provenance: []snapshot.Provenance{{Type: "traces", 
Connector: inferConnector(opts.Sources), Ref: sourceRef}}, + }) } sort.Slice(edges, func(i, j int) bool { left, right := edges[i], edges[j] @@ -123,12 +168,25 @@ func Build(spans []traces.Span, opts Options) (model.ResilienceModel, error) { } return !left.Blocking && right.Blocking }) - - endpoints := make([]model.Endpoint, 0, len(endpointSet)) - for _, endpoint := range endpointSet { - endpoints = append(endpoints, endpoint) + sort.Slice(edgeRecords, func(i, j int) bool { return edgeRecords[i].ID < edgeRecords[j].ID }) + + endpoints := make([]model.Endpoint, 0, len(endpointStats)) + endpointRecords := make([]snapshot.EndpointRecord, 0, len(endpointStats)) + for _, stat := range endpointStats { + endpoints = append(endpoints, stat.endpoint) + endpointRecords = append(endpointRecords, snapshot.EndpointRecord{ + ID: stat.endpoint.ID, + EntryService: stat.endpoint.EntryService, + Method: stat.method, + Path: stat.path, + Support: stat.supportSummary(), + FirstSeen: formatOptionalTime(stat.firstSeen, opts.RuntimeMode), + LastSeen: formatOptionalTime(stat.lastSeen, opts.RuntimeMode), + Provenance: []snapshot.Provenance{{Type: "traces", Connector: inferConnector(opts.Sources), Ref: sourceRef}}, + }) } sort.Slice(endpoints, func(i, j int) bool { return endpoints[i].ID < endpoints[j].ID }) + sort.Slice(endpointRecords, func(i, j int) bool { return endpointRecords[i].ID < endpointRecords[j].ID }) confidence := calculateConfidence(len(spans), len(services), len(edges), len(endpoints), linkedCrossServiceEdges) mdl := model.ResilienceModel{ @@ -149,10 +207,38 @@ func Build(spans []traces.Span, opts Options) (model.ResilienceModel, error) { }, } if err := mdl.ValidateSemantic(); err != nil { - return model.ResilienceModel{}, err + return Result{}, err + } + mdl.SortDeterministic() + + applications, err := applyOverlays(&mdl, serviceRecords, edgeRecords, endpointRecords, allOverlays) + if err != nil { + return Result{}, err } mdl.SortDeterministic() - return mdl, nil + 
serviceRecords = rebuildServiceRecords(serviceRecords, mdl.Services) + + coverage := snapshot.CoverageSummary{ + Confidence: confidence, + ServiceSupportMin: minSupportServices(serviceRecords), + EdgeSupportMin: minSupportEdges(edgeRecords), + EndpointSupportMin: minSupportEndpoints(endpointRecords), + } + + result := Result{ + Model: mdl, + Discovery: snapshot.DiscoveryDetails{ + Services: serviceRecords, + Edges: edgeRecords, + Endpoints: endpointRecords, + Overlays: applications, + }, + Sources: defaultSources(opts.Sources, sourceRef, len(spans)), + Coverage: coverage, + SpanCount: len(spans), + TraceCount: len(traceSet), + } + return result, nil } func BuildSourceRef(input string) string { @@ -164,6 +250,13 @@ func BuildSourceRef(input string) string { return "bering://discover?input=" + url.QueryEscape(clean) } +func BuildServeSourceRef(listenAddr string) string { + if strings.TrimSpace(listenAddr) == "" { + return "bering://serve" + } + return "bering://serve?listen=" + url.QueryEscape(strings.TrimSpace(listenAddr)) +} + func parentSpan(child traces.Span, index map[string]traces.Span) (traces.Span, bool) { if strings.TrimSpace(child.TraceID) == "" || strings.TrimSpace(child.ParentSpanID) == "" { return traces.Span{}, false @@ -176,11 +269,17 @@ func traceSpanKey(traceID, spanID string) string { return strings.TrimSpace(traceID) + "|" + strings.TrimSpace(spanID) } -func edgeKind(parent, child traces.Span) model.EdgeKind { - if isAsyncSpan(parent) || isAsyncSpan(child) { - return model.EdgeKindAsync +func edgeKind(parent, child traces.Span) (model.EdgeKind, []string) { + evidence := []string{"default_sync"} + if isAsyncSpan(parent) { + evidence = append([]string{"parent_async_signal"}, asyncEvidence(parent)...) + return model.EdgeKindAsync, dedupeStrings(evidence) + } + if isAsyncSpan(child) { + evidence = append([]string{"child_async_signal"}, asyncEvidence(child)...) 
+ return model.EdgeKindAsync, dedupeStrings(evidence) } - return model.EdgeKindSync + return model.EdgeKindSync, evidence } func isAsyncSpan(span traces.Span) bool { @@ -200,10 +299,33 @@ func isAsyncSpan(span traces.Span) bool { return false } +func asyncEvidence(span traces.Span) []string { + out := []string{} + kind := strings.ToLower(strings.TrimSpace(span.Kind)) + if kind == "producer" || kind == "consumer" { + out = append(out, "span.kind="+kind) + } + for _, key := range []string{"messaging.system", "messaging.destination", "messaging.operation"} { + if attrString(span.Attributes, key) != "" { + out = append(out, key) + } + } + return out +} + func inferEndpoint(span, parent traces.Span, hasParent bool) (model.Endpoint, bool) { - if strings.TrimSpace(span.Service) == "" { + method, path, ok := inferEndpointDetails(span, parent, hasParent) + if !ok { return model.Endpoint{}, false } + id := fmt.Sprintf("%s:%s %s", span.Service, method, path) + return model.Endpoint{ID: id, EntryService: span.Service, SuccessPredicateRef: id}, true +} + +func inferEndpointDetails(span, parent traces.Span, hasParent bool) (string, string, bool) { + if strings.TrimSpace(span.Service) == "" { + return "", "", false + } method := strings.ToUpper(strings.TrimSpace(firstAttr( span.Attributes, @@ -214,7 +336,7 @@ func inferEndpoint(span, parent traces.Span, hasParent bool) (model.Endpoint, bo method, _ = parseMethodAndPathFromSpanName(span.Name) } if method == "" { - return model.Endpoint{}, false + return "", "", false } path := strings.TrimSpace(firstAttr( @@ -228,23 +350,17 @@ func inferEndpoint(span, parent traces.Span, hasParent bool) (model.Endpoint, bo } path = normalizePath(path) if path == "" { - return model.Endpoint{}, false + return "", "", false } isServer := strings.EqualFold(span.Kind, "server") if !isServer { - // If span isn't marked as server, only keep it as entry when it has no same-service parent. 
if hasParent && parent.Service == span.Service { - return model.Endpoint{}, false + return "", "", false } } - id := fmt.Sprintf("%s:%s %s", span.Service, method, path) - return model.Endpoint{ - ID: id, - EntryService: span.Service, - SuccessPredicateRef: id, - }, true + return method, path, true } func calculateConfidence(totalSpans, serviceCount, edgeCount, endpointCount, linkedCrossEdges int) float64 { @@ -264,7 +380,6 @@ func calculateConfidence(totalSpans, serviceCount, edgeCount, endpointCount, lin if score > 1 { score = 1 } - // Keep two decimals stable for deterministic output readability. return math.Round(score*100) / 100 } @@ -341,3 +456,363 @@ func normalizePath(path string) string { } return path } + +type serviceAccumulator struct { + observations int + traceIDs map[string]struct{} + firstSeen time.Time + lastSeen time.Time +} + +func updateServiceAccumulator(acc *serviceAccumulator, span traces.Span, runtimeMode bool) *serviceAccumulator { + if acc == nil { + acc = &serviceAccumulator{traceIDs: map[string]struct{}{}} + } + acc.observations++ + if span.TraceID != "" { + acc.traceIDs[span.TraceID] = struct{}{} + } + acc.observeTime(span.EventTime(time.Time{}), runtimeMode) + return acc +} + +func (a *serviceAccumulator) observeTime(ts time.Time, runtimeMode bool) { + if !runtimeMode || ts.IsZero() { + return + } + if a.firstSeen.IsZero() || ts.Before(a.firstSeen) { + a.firstSeen = ts + } + if a.lastSeen.IsZero() || ts.After(a.lastSeen) { + a.lastSeen = ts + } +} + +func (a *serviceAccumulator) supportSummary() snapshot.SupportSummary { + return snapshot.SupportSummary{Observations: a.observations, TraceCount: len(a.traceIDs)} +} + +type edgeAccumulator struct { + id string + edge model.Edge + observations int + traceIDs map[string]struct{} + evidence map[string]struct{} + firstSeen time.Time + lastSeen time.Time +} + +func updateEdgeAccumulator(acc *edgeAccumulator, id string, parent, child traces.Span, kind model.EdgeKind, blocking bool, evidence 
[]string, runtimeMode bool) *edgeAccumulator { + if acc == nil { + acc = &edgeAccumulator{ + id: id, + edge: model.Edge{From: parent.Service, To: child.Service, Kind: kind, Blocking: blocking}, + traceIDs: map[string]struct{}{}, + evidence: map[string]struct{}{}, + } + } + acc.observations++ + if child.TraceID != "" { + acc.traceIDs[child.TraceID] = struct{}{} + } + for _, item := range evidence { + if strings.TrimSpace(item) != "" { + acc.evidence[item] = struct{}{} + } + } + acc.observeTime(child.EventTime(time.Time{}), runtimeMode) + return acc +} + +func (a *edgeAccumulator) observeTime(ts time.Time, runtimeMode bool) { + if !runtimeMode || ts.IsZero() { + return + } + if a.firstSeen.IsZero() || ts.Before(a.firstSeen) { + a.firstSeen = ts + } + if a.lastSeen.IsZero() || ts.After(a.lastSeen) { + a.lastSeen = ts + } +} + +func (a *edgeAccumulator) supportSummary() snapshot.SupportSummary { + return snapshot.SupportSummary{Observations: a.observations, TraceCount: len(a.traceIDs), Evidence: sortStringSet(a.evidence)} +} + +type endpointAccumulator struct { + endpoint model.Endpoint + method string + path string + observations int + traceIDs map[string]struct{} + firstSeen time.Time + lastSeen time.Time +} + +func updateEndpointAccumulator(acc *endpointAccumulator, endpoint model.Endpoint, span traces.Span, runtimeMode bool) *endpointAccumulator { + method, path, _ := inferEndpointDetails(span, traces.Span{}, false) + if acc == nil { + acc = &endpointAccumulator{endpoint: endpoint, method: method, path: path, traceIDs: map[string]struct{}{}} + } + acc.observations++ + if span.TraceID != "" { + acc.traceIDs[span.TraceID] = struct{}{} + } + acc.observeTime(span.EventTime(time.Time{}), runtimeMode) + return acc +} + +func (a *endpointAccumulator) observeTime(ts time.Time, runtimeMode bool) { + if !runtimeMode || ts.IsZero() { + return + } + if a.firstSeen.IsZero() || ts.Before(a.firstSeen) { + a.firstSeen = ts + } + if a.lastSeen.IsZero() || ts.After(a.lastSeen) { + 
a.lastSeen = ts + } +} + +func (a *endpointAccumulator) supportSummary() snapshot.SupportSummary { + return snapshot.SupportSummary{Observations: a.observations, TraceCount: len(a.traceIDs)} +} + +func applyReplicaOverlays(serviceSet map[string]int, overlays []overlay.File) error { + for _, file := range overlays { + for _, item := range file.Services { + if item.Replicas == nil { + continue + } + if _, exists := serviceSet[item.ID]; !exists { + return fmt.Errorf("overlay %q references unknown service %q", file.Name, item.ID) + } + serviceSet[item.ID] = *item.Replicas + } + } + return nil +} + +func applyOverlays(mdl *model.ResilienceModel, services []snapshot.ServiceRecord, edges []snapshot.EdgeRecord, endpoints []snapshot.EndpointRecord, overlays []overlay.File) ([]snapshot.OverlayApplication, error) { + serviceIndex := make(map[string]int, len(services)) + serviceModelIndex := make(map[string]int, len(mdl.Services)) + edgeIndex := make(map[string]int, len(edges)) + endpointIndex := make(map[string]int, len(endpoints)) + for i, item := range services { + serviceIndex[item.ID] = i + } + for i, item := range mdl.Services { + serviceModelIndex[item.ID] = i + } + for i, item := range edges { + edgeIndex[item.ID] = i + } + for i, item := range endpoints { + endpointIndex[item.ID] = i + } + applications := make([]snapshot.OverlayApplication, 0, len(overlays)) + for i, file := range overlays { + precedence := i + 1 + applications = append(applications, snapshot.OverlayApplication{Name: file.Name, Ref: file.Ref, Precedence: precedence}) + prov := snapshot.Provenance{Type: "overlay", Name: file.Name, Ref: file.Ref, Precedence: precedence} + for _, item := range file.Services { + index, ok := serviceIndex[item.ID] + if !ok { + return nil, fmt.Errorf("overlay %q references unknown service %q", file.Name, item.ID) + } + mergeServiceMetadata(&services[index], item) + services[index].Provenance = append(services[index].Provenance, prov) + if item.Replicas != nil { + 
services[index].Replicas = *item.Replicas + if modelIndex, ok := serviceModelIndex[item.ID]; ok { + mdl.Services[modelIndex].Replicas = *item.Replicas + } + } + } + for _, item := range file.Edges { + index, ok := edgeIndex[item.ID] + if !ok { + return nil, fmt.Errorf("overlay %q references unknown edge %q", file.Name, item.ID) + } + mergeEdgeMetadata(&edges[index], item) + edges[index].Provenance = append(edges[index].Provenance, prov) + } + for _, item := range file.Endpoints { + index, ok := endpointIndex[item.ID] + if !ok { + return nil, fmt.Errorf("overlay %q references unknown endpoint %q", file.Name, item.ID) + } + mergeEndpointMetadata(&endpoints[index], item) + endpoints[index].Provenance = append(endpoints[index].Provenance, prov) + if strings.TrimSpace(item.PredicateRef) != "" { + for endpointIdx := range mdl.Endpoints { + if mdl.Endpoints[endpointIdx].ID == item.ID { + mdl.Endpoints[endpointIdx].SuccessPredicateRef = item.PredicateRef + break + } + } + } + } + } + return applications, nil +} + +func mergeServiceMetadata(target *snapshot.ServiceRecord, item overlay.ServiceOverlay) { + mergeCommonMetadata(&target.Metadata.CommonMetadata, item.CommonMetadata) + mergeAttributes(&target.Metadata.Attributes, item.Attributes) + if item.FailureEligible != nil { + target.Metadata.FailureEligible = item.FailureEligible + } + if item.Replicas != nil { + target.Metadata.ReplicasOverride = item.Replicas + } +} + +func mergeEdgeMetadata(target *snapshot.EdgeRecord, item overlay.EdgeOverlay) { + mergeCommonMetadata(&target.Metadata.CommonMetadata, item.CommonMetadata) + mergeAttributes(&target.Metadata.Attributes, item.Attributes) + if item.Weight != nil { + target.Metadata.Weight = item.Weight + } +} + +func mergeEndpointMetadata(target *snapshot.EndpointRecord, item overlay.EndpointOverlay) { + mergeCommonMetadata(&target.Metadata.CommonMetadata, item.CommonMetadata) + mergeAttributes(&target.Metadata.Attributes, item.Attributes) + if item.Weight != nil { + 
target.Metadata.Weight = item.Weight + } + if strings.TrimSpace(item.PredicateRef) != "" { + target.Metadata.PredicateRef = strings.TrimSpace(item.PredicateRef) + } + if strings.TrimSpace(item.Method) != "" { + target.Method = strings.ToUpper(strings.TrimSpace(item.Method)) + } + if strings.TrimSpace(item.Path) != "" { + target.Path = normalizePath(item.Path) + } +} + +func mergeCommonMetadata(target *snapshot.CommonMetadata, source overlay.CommonMetadata) { + if len(source.Labels) > 0 { + if target.Labels == nil { + target.Labels = map[string]string{} + } + for key, value := range source.Labels { + target.Labels[key] = value + } + } + if len(source.Tags) > 0 { + target.Tags = dedupeStrings(append(target.Tags, source.Tags...)) + } + if len(source.SLORefs) > 0 { + target.SLORefs = dedupeStrings(append(target.SLORefs, source.SLORefs...)) + } +} + +func mergeAttributes(target *map[string]string, source map[string]string) { + if len(source) == 0 { + return + } + if *target == nil { + *target = map[string]string{} + } + for key, value := range source { + (*target)[key] = value + } +} + +func rebuildServiceRecords(records []snapshot.ServiceRecord, services []model.Service) []snapshot.ServiceRecord { + index := make(map[string]model.Service, len(services)) + for _, item := range services { + index[item.ID] = item + } + for i := range records { + if svc, ok := index[records[i].ID]; ok { + records[i].Replicas = svc.Replicas + records[i].Name = svc.Name + } + } + return records +} + +func minSupportServices(items []snapshot.ServiceRecord) int { + return minSupport(len(items), func(i int) int { return items[i].Support.Observations }) +} + +func minSupportEdges(items []snapshot.EdgeRecord) int { + return minSupport(len(items), func(i int) int { return items[i].Support.Observations }) +} + +func minSupportEndpoints(items []snapshot.EndpointRecord) int { + return minSupport(len(items), func(i int) int { return items[i].Support.Observations }) +} + +func minSupport(length int, 
value func(int) int) int { + if length == 0 { + return 0 + } + min := value(0) + for i := 1; i < length; i++ { + if current := value(i); current < min { + min = current + } + } + return min +} + +func formatOptionalTime(ts time.Time, include bool) string { + if !include || ts.IsZero() { + return "" + } + return ts.UTC().Format(time.RFC3339) +} + +func sortStringSet(values map[string]struct{}) []string { + out := make([]string, 0, len(values)) + for value := range values { + out = append(out, value) + } + sort.Strings(out) + return out +} + +func dedupeStrings(values []string) []string { + set := map[string]struct{}{} + out := make([]string, 0, len(values)) + for _, value := range values { + trimmed := strings.TrimSpace(value) + if trimmed == "" { + continue + } + if _, exists := set[trimmed]; exists { + continue + } + set[trimmed] = struct{}{} + out = append(out, trimmed) + } + sort.Strings(out) + return out +} + +func inferConnector(sources []snapshot.SourceSummary) string { + if len(sources) == 0 { + return "trace_file" + } + return strings.TrimSpace(sources[0].Connector) +} + +func defaultSources(sources []snapshot.SourceSummary, ref string, observations int) []snapshot.SourceSummary { + if len(sources) > 0 { + out := append([]snapshot.SourceSummary(nil), sources...) 
+ if out[0].Ref == "" { + out[0].Ref = ref + } + if out[0].Observations == 0 { + out[0].Observations = observations + } + return out + } + return []snapshot.SourceSummary{{Type: "traces", Connector: "trace_file", Ref: ref, Observations: observations}} +} diff --git a/internal/discovery/discovery_test.go b/internal/discovery/discovery_test.go index 67a0d2e..98f3088 100644 --- a/internal/discovery/discovery_test.go +++ b/internal/discovery/discovery_test.go @@ -2,9 +2,11 @@ package discovery import ( "testing" + "time" "github.com/MB3R-Lab/Bering/internal/connectors/traces" "github.com/MB3R-Lab/Bering/internal/model" + "github.com/MB3R-Lab/Bering/internal/overlay" "github.com/MB3R-Lab/Bering/internal/schema" ) @@ -81,3 +83,55 @@ func TestBuild_UnknownReplicaOverrideFails(t *testing.T) { t.Fatal("expected unknown override service error, got nil") } } + +func TestDiscover_OverlayAppliesRuntimeMetadata(t *testing.T) { + t.Parallel() + + now := time.Date(2026, 3, 11, 12, 0, 0, 0, time.UTC) + replicas := 3 + weight := 0.7 + spans := []traces.Span{ + { + TraceID: "t1", SpanID: "1", Service: "checkout", Name: "POST /process", Kind: "server", + StartTime: now, EndTime: now.Add(50 * time.Millisecond), + Attributes: map[string]any{"http.request.method": "POST", "http.route": "/process"}, + }, + } + + result, err := Discover(spans, Options{ + SourceRef: BuildSourceRef("examples/traces/normalized.sample.json"), + DiscoveredAt: "2026-03-03T00:00:00Z", + RuntimeMode: true, + Overlays: []overlay.File{{ + Name: "test", + Services: []overlay.ServiceOverlay{{ID: "checkout", Replicas: &replicas, FailureEligible: boolPtr(true), CommonMetadata: overlay.CommonMetadata{Attributes: map[string]string{"team": "commerce"}}}}, + Endpoints: []overlay.EndpointOverlay{{ID: "checkout:POST /process", PredicateRef: "catalog.checkout.success", Weight: &weight, CommonMetadata: overlay.CommonMetadata{Attributes: map[string]string{"verb": "write"}}}}, + }}, + }) + if err != nil { + t.Fatalf("Discover 
returned error: %v", err) + } + + if got := result.Model.Services[0].Replicas; got != 3 { + t.Fatalf("replicas override mismatch: got=%d", got) + } + if got := result.Model.Endpoints[0].SuccessPredicateRef; got != "catalog.checkout.success" { + t.Fatalf("predicate ref mismatch: got=%s", got) + } + if got := result.Discovery.Services[0].FirstSeen; got == "" { + t.Fatal("expected runtime first_seen to be populated") + } + if got := result.Discovery.Endpoints[0].Metadata.PredicateRef; got != "catalog.checkout.success" { + t.Fatalf("endpoint metadata predicate mismatch: got=%s", got) + } + if got := result.Discovery.Services[0].Metadata.Attributes["team"]; got != "commerce" { + t.Fatalf("service metadata attribute mismatch: got=%s", got) + } + if got := result.Discovery.Endpoints[0].Metadata.Attributes["verb"]; got != "write" { + t.Fatalf("endpoint metadata attribute mismatch: got=%s", got) + } +} + +func boolPtr(v bool) *bool { + return &v +} diff --git a/internal/overlay/load.go b/internal/overlay/load.go new file mode 100644 index 0000000..b3fe623 --- /dev/null +++ b/internal/overlay/load.go @@ -0,0 +1,50 @@ +package overlay + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + + "gopkg.in/yaml.v3" +) + +func LoadFile(path string) (File, error) { + raw, err := os.ReadFile(path) + if err != nil { + return File{}, fmt.Errorf("read overlay file: %w", err) + } + var out File + ext := strings.ToLower(filepath.Ext(path)) + switch ext { + case ".json": + if err := json.Unmarshal(raw, &out); err != nil { + return File{}, fmt.Errorf("decode overlay json: %w", err) + } + default: + if err := yaml.Unmarshal(raw, &out); err != nil { + return File{}, fmt.Errorf("decode overlay yaml: %w", err) + } + } + if err := out.Normalize(path); err != nil { + return File{}, err + } + return out, nil +} + +func LoadFiles(paths []string) ([]File, error) { + out := make([]File, 0, len(paths)) + for _, path := range paths { + trimmed := strings.TrimSpace(path) + if 
trimmed == "" { + continue + } + item, err := LoadFile(trimmed) + if err != nil { + return nil, fmt.Errorf("load overlay %q: %w", trimmed, err) + } + out = append(out, item) + } + return out, nil +} diff --git a/internal/overlay/overlay.go b/internal/overlay/overlay.go new file mode 100644 index 0000000..39a0c0e --- /dev/null +++ b/internal/overlay/overlay.go @@ -0,0 +1,111 @@ +package overlay + +import ( + "fmt" + "path/filepath" + "strings" +) + +type File struct { + Name string `json:"name" yaml:"name"` + Services []ServiceOverlay `json:"services" yaml:"services"` + Edges []EdgeOverlay `json:"edges" yaml:"edges"` + Endpoints []EndpointOverlay `json:"endpoints" yaml:"endpoints"` + Ref string `json:"-" yaml:"-"` +} + +type CommonMetadata struct { + Labels map[string]string `json:"labels" yaml:"labels"` + Tags []string `json:"tags" yaml:"tags"` + SLORefs []string `json:"slo_refs" yaml:"slo_refs"` + Attributes map[string]string `json:"attributes" yaml:"attributes"` +} + +type ServiceOverlay struct { + ID string `json:"id" yaml:"id"` + Replicas *int `json:"replicas" yaml:"replicas"` + FailureEligible *bool `json:"failure_eligible" yaml:"failure_eligible"` + CommonMetadata `json:",inline" yaml:",inline"` +} + +type EdgeOverlay struct { + ID string `json:"id" yaml:"id"` + From string `json:"from" yaml:"from"` + To string `json:"to" yaml:"to"` + Kind string `json:"kind" yaml:"kind"` + Blocking *bool `json:"blocking" yaml:"blocking"` + Weight *float64 `json:"weight" yaml:"weight"` + CommonMetadata `json:",inline" yaml:",inline"` +} + +type EndpointOverlay struct { + ID string `json:"id" yaml:"id"` + EntryService string `json:"entry_service" yaml:"entry_service"` + Method string `json:"method" yaml:"method"` + Path string `json:"path" yaml:"path"` + PredicateRef string `json:"predicate_ref" yaml:"predicate_ref"` + Weight *float64 `json:"weight" yaml:"weight"` + CommonMetadata `json:",inline" yaml:",inline"` +} + +func (f *File) Normalize(path string) error { + f.Ref = 
filepath.ToSlash(filepath.Clean(path)) + f.Name = strings.TrimSpace(f.Name) + if f.Name == "" { + f.Name = strings.TrimSuffix(filepath.Base(path), filepath.Ext(path)) + } + for i := range f.Services { + f.Services[i].ID = strings.TrimSpace(f.Services[i].ID) + if f.Services[i].ID == "" { + return fmt.Errorf("service overlay at index %d has empty id", i) + } + } + for i := range f.Edges { + item := &f.Edges[i] + item.ID = strings.TrimSpace(item.ID) + item.From = strings.TrimSpace(item.From) + item.To = strings.TrimSpace(item.To) + item.Kind = strings.TrimSpace(item.Kind) + if item.ID == "" { + if item.From == "" || item.To == "" || item.Kind == "" || item.Blocking == nil { + return fmt.Errorf("edge overlay at index %d requires id or from/to/kind/blocking", i) + } + item.ID = fmt.Sprintf("%s|%s|%s|%t", item.From, item.To, item.Kind, *item.Blocking) + } + } + for i := range f.Endpoints { + item := &f.Endpoints[i] + item.ID = strings.TrimSpace(item.ID) + item.EntryService = strings.TrimSpace(item.EntryService) + item.Method = strings.ToUpper(strings.TrimSpace(item.Method)) + item.Path = normalizePath(item.Path) + item.PredicateRef = strings.TrimSpace(item.PredicateRef) + if item.ID == "" { + if item.EntryService == "" || item.Method == "" || item.Path == "" { + return fmt.Errorf("endpoint overlay at index %d requires id or entry_service/method/path", i) + } + item.ID = fmt.Sprintf("%s:%s %s", item.EntryService, item.Method, item.Path) + } + } + return nil +} + +func FromReplicas(values map[string]int, ref string) File { + out := File{Name: "replicas", Ref: ref} + for id, replicas := range values { + value := replicas + out.Services = append(out.Services, ServiceOverlay{ID: id, Replicas: &value}) + } + return out +} + +func normalizePath(path string) string { + path = strings.TrimSpace(path) + if path == "" { + return "" + } + if !strings.HasPrefix(path, "/") { + path = "/" + path + } + return path +} diff --git a/internal/overlay/overlay_test.go 
b/internal/overlay/overlay_test.go new file mode 100644 index 0000000..77d0dcc --- /dev/null +++ b/internal/overlay/overlay_test.go @@ -0,0 +1,18 @@ +package overlay + +import ( + "path/filepath" + "testing" +) + +func TestFileNormalizeCanonicalizesRef(t *testing.T) { + t.Parallel() + + item := File{Name: "test"} + if err := item.Normalize(filepath.Join("configs", "nested", "overlay.yaml")); err != nil { + t.Fatalf("Normalize returned error: %v", err) + } + if item.Ref != "configs/nested/overlay.yaml" { + t.Fatalf("ref mismatch: got=%s", item.Ref) + } +} diff --git a/internal/runtime/engine.go b/internal/runtime/engine.go new file mode 100644 index 0000000..d7f885e --- /dev/null +++ b/internal/runtime/engine.go @@ -0,0 +1,347 @@ +package runtime + +import ( + "context" + "fmt" + "log/slog" + "strings" + "sync" + "time" + + "github.com/MB3R-Lab/Bering/internal/connectors/traces" + "github.com/MB3R-Lab/Bering/internal/discovery" + "github.com/MB3R-Lab/Bering/internal/model" + "github.com/MB3R-Lab/Bering/internal/overlay" + "github.com/MB3R-Lab/Bering/internal/schema" + "github.com/MB3R-Lab/Bering/internal/snapshot" +) + +type EngineConfig struct { + WindowSize time.Duration + MaxInMemorySpans int + LateSpanPolicy string + Sink SnapshotSink + Metrics *Metrics + Logger *slog.Logger + Now func() time.Time + SourceRef string + Sources []snapshot.SourceSummary + Overlays []overlay.File +} + +type Engine struct { + cfg EngineConfig + mu sync.Mutex + current windowState + previous *snapshot.Envelope +} + +type windowState struct { + start time.Time + end time.Time + spans []traces.Span + traces map[string]struct{} + dropped int + late int +} + +func NewEngine(cfg EngineConfig) (*Engine, error) { + if cfg.WindowSize <= 0 { + return nil, fmt.Errorf("window size must be > 0") + } + if cfg.MaxInMemorySpans <= 0 { + return nil, fmt.Errorf("max in-memory spans must be > 0") + } + switch strings.ToLower(strings.TrimSpace(cfg.LateSpanPolicy)) { + case "drop", "current_window": + 
default: + return nil, fmt.Errorf("unsupported late span policy: %s", cfg.LateSpanPolicy) + } + if cfg.Sink == nil { + return nil, fmt.Errorf("snapshot sink is required") + } + if cfg.Metrics == nil { + cfg.Metrics = NewMetrics() + } + if cfg.Logger == nil { + cfg.Logger = slog.New(slog.NewTextHandler(ioDiscard{}, nil)) + } + if cfg.Now == nil { + cfg.Now = time.Now + } + start, end := alignWindow(cfg.Now().UTC(), cfg.WindowSize) + return &Engine{ + cfg: cfg, + current: windowState{ + start: start, + end: end, + traces: map[string]struct{}{}, + }, + }, nil +} + +func (e *Engine) Ingest(ctx context.Context, spans []traces.Span, receivedAt time.Time) error { + if receivedAt.IsZero() { + receivedAt = e.cfg.Now().UTC() + } + e.cfg.Metrics.AddIngested(len(spans)) + if err := e.flushDue(ctx, receivedAt.UTC()); err != nil { + return err + } + + e.mu.Lock() + defer e.mu.Unlock() + + dropped := 0 + lateDropped := 0 + lateAccepted := 0 + for _, span := range spans { + eventTime := span.EventTime(receivedAt) + if eventTime.Before(e.current.start) { + e.current.late++ + if strings.EqualFold(e.cfg.LateSpanPolicy, "drop") { + e.current.dropped++ + dropped++ + lateDropped++ + continue + } + lateAccepted++ + } + if len(e.current.spans) >= e.cfg.MaxInMemorySpans { + e.current.dropped++ + dropped++ + continue + } + e.current.spans = append(e.current.spans, span) + if span.TraceID != "" { + e.current.traces[span.TraceID] = struct{}{} + } + } + if dropped > 0 { + e.cfg.Metrics.AddDropped(dropped) + } + if lateDropped > 0 || lateAccepted > 0 { + e.cfg.Logger.Warn("late spans observed", + slog.Int("dropped", lateDropped), + slog.Int("accepted_current_window", lateAccepted), + slog.String("policy", e.cfg.LateSpanPolicy), + ) + } + if dropped-lateDropped > 0 { + e.cfg.Logger.Warn("spans dropped due to in-memory buffer limit", slog.Int("dropped", dropped-lateDropped), slog.Int("max_in_memory_spans", e.cfg.MaxInMemorySpans)) + } + return nil +} + +func (e *Engine) FlushDue(ctx 
context.Context) error { + return e.flushDue(ctx, e.cfg.Now().UTC()) +} + +func (e *Engine) Close(ctx context.Context) error { + return e.flushCurrent(ctx, e.cfg.Now().UTC(), true) +} + +func (e *Engine) flushDue(ctx context.Context, now time.Time) error { + for { + e.mu.Lock() + currentEnd := e.current.end + e.cfg.Metrics.SetWindowLag(now, currentEnd) + shouldFlush := !now.Before(currentEnd) + e.mu.Unlock() + if !shouldFlush { + return nil + } + if err := e.flushCurrent(ctx, now, false); err != nil { + return err + } + } +} + +func (e *Engine) flushCurrent(ctx context.Context, now time.Time, force bool) error { + e.mu.Lock() + current := e.current + if len(current.spans) == 0 && !force { + e.current = newWindow(current.end, e.cfg.WindowSize) + e.mu.Unlock() + return nil + } + if len(current.spans) == 0 && force { + e.mu.Unlock() + return nil + } + e.current = newWindow(current.end, e.cfg.WindowSize) + previous := e.previous + e.mu.Unlock() + + started := e.cfg.Now().UTC() + result, err := discovery.Discover(current.spans, discovery.Options{ + SourceRef: e.cfg.SourceRef, + DiscoveredAt: current.end.Format(time.RFC3339), + Overlays: e.cfg.Overlays, + Sources: e.cfg.Sources, + RuntimeMode: true, + }) + if err != nil { + return fmt.Errorf("build snapshot discovery result: %w", err) + } + topologyVersion, err := snapshot.TopologyDigest(result.Model) + if err != nil { + return fmt.Errorf("compute topology digest: %w", err) + } + env := snapshot.Envelope{ + SnapshotID: snapshot.BuildSnapshotID(current.start.Format(time.RFC3339), current.end.Format(time.RFC3339), topologyVersion), + TopologyVersion: topologyVersion, + WindowStart: current.start.Format(time.RFC3339), + WindowEnd: current.end.Format(time.RFC3339), + Ingest: snapshot.IngestSummary{ + Spans: len(current.spans), + Traces: len(current.traces), + DroppedSpans: current.dropped, + LateSpans: current.late, + }, + Counts: snapshot.Counts{ + Services: len(result.Model.Services), + Edges: len(result.Model.Edges), + 
Endpoints: len(result.Model.Endpoints), + }, + Coverage: result.Coverage, + Sources: result.Sources, + Discovery: result.Discovery, + Model: result.Model, + Metadata: snapshot.Metadata{ + SourceType: discovery.SourceTypeBering, + SourceRef: e.cfg.SourceRef, + EmittedAt: started.Format(time.RFC3339), + Confidence: result.Coverage.Confidence, + Schema: model.SchemaRef{ + Name: schema.ExpectedSnapshotSchemaName, + Version: schema.ExpectedSnapshotSchemaVersion, + URI: schema.ExpectedSnapshotSchemaURI, + Digest: schema.ExpectedSnapshotSchemaDigest, + }, + }, + } + carryForwardObservationTimes(previous, &env) + env.Diff = snapshot.ComputeDiff(previous, env) + env.SortDeterministic() + if err := e.cfg.Sink.Write(ctx, env); err != nil { + return err + } + finished := e.cfg.Now().UTC() + e.cfg.Metrics.RecordSnapshot(env, finished.Sub(started), finished) + e.cfg.Logger.Info("snapshot emitted", + slog.String("snapshot_id", env.SnapshotID), + slog.String("topology_version", env.TopologyVersion), + slog.String("window_start", env.WindowStart), + slog.String("window_end", env.WindowEnd), + slog.Int("services", env.Counts.Services), + slog.Int("edges", env.Counts.Edges), + slog.Int("endpoints", env.Counts.Endpoints), + slog.Int("dropped_spans", env.Ingest.DroppedSpans), + slog.Int("late_spans", env.Ingest.LateSpans), + ) + copyEnv := env + e.mu.Lock() + e.previous = &copyEnv + e.mu.Unlock() + return nil +} + +func alignWindow(now time.Time, size time.Duration) (time.Time, time.Time) { + start := now.UTC().Truncate(size) + return start, start.Add(size) +} + +func newWindow(start time.Time, size time.Duration) windowState { + start = start.UTC() + return windowState{start: start, end: start.Add(size), traces: map[string]struct{}{}} +} + +func carryForwardObservationTimes(previous *snapshot.Envelope, current *snapshot.Envelope) { + if previous == nil || current == nil { + return + } + + serviceIndex := make(map[string]snapshot.ServiceRecord, len(previous.Discovery.Services)) + for _, 
item := range previous.Discovery.Services { + serviceIndex[item.ID] = item + } + for i := range current.Discovery.Services { + if prev, ok := serviceIndex[current.Discovery.Services[i].ID]; ok { + current.Discovery.Services[i].FirstSeen = mergeFirstSeen(prev.FirstSeen, current.Discovery.Services[i].FirstSeen) + current.Discovery.Services[i].LastSeen = mergeLastSeen(prev.LastSeen, current.Discovery.Services[i].LastSeen) + } + } + + edgeIndex := make(map[string]snapshot.EdgeRecord, len(previous.Discovery.Edges)) + for _, item := range previous.Discovery.Edges { + edgeIndex[item.ID] = item + } + for i := range current.Discovery.Edges { + if prev, ok := edgeIndex[current.Discovery.Edges[i].ID]; ok { + current.Discovery.Edges[i].FirstSeen = mergeFirstSeen(prev.FirstSeen, current.Discovery.Edges[i].FirstSeen) + current.Discovery.Edges[i].LastSeen = mergeLastSeen(prev.LastSeen, current.Discovery.Edges[i].LastSeen) + } + } + + endpointIndex := make(map[string]snapshot.EndpointRecord, len(previous.Discovery.Endpoints)) + for _, item := range previous.Discovery.Endpoints { + endpointIndex[item.ID] = item + } + for i := range current.Discovery.Endpoints { + if prev, ok := endpointIndex[current.Discovery.Endpoints[i].ID]; ok { + current.Discovery.Endpoints[i].FirstSeen = mergeFirstSeen(prev.FirstSeen, current.Discovery.Endpoints[i].FirstSeen) + current.Discovery.Endpoints[i].LastSeen = mergeLastSeen(prev.LastSeen, current.Discovery.Endpoints[i].LastSeen) + } + } +} + +func mergeFirstSeen(previous, current string) string { + prevTime, prevOK := parseSeenTime(previous) + currTime, currOK := parseSeenTime(current) + switch { + case prevOK && currOK: + if prevTime.Before(currTime) { + return prevTime.Format(time.RFC3339) + } + return currTime.Format(time.RFC3339) + case prevOK: + return prevTime.Format(time.RFC3339) + case currOK: + return currTime.Format(time.RFC3339) + default: + return "" + } +} + +func mergeLastSeen(previous, current string) string { + prevTime, prevOK := 
parseSeenTime(previous) + currTime, currOK := parseSeenTime(current) + switch { + case prevOK && currOK: + if prevTime.After(currTime) { + return prevTime.Format(time.RFC3339) + } + return currTime.Format(time.RFC3339) + case prevOK: + return prevTime.Format(time.RFC3339) + case currOK: + return currTime.Format(time.RFC3339) + default: + return "" + } +} + +func parseSeenTime(value string) (time.Time, bool) { + parsed, err := time.Parse(time.RFC3339, strings.TrimSpace(value)) + if err != nil { + return time.Time{}, false + } + return parsed.UTC(), true +} + +type ioDiscard struct{} + +func (ioDiscard) Write(p []byte) (int, error) { + return len(p), nil +} diff --git a/internal/runtime/engine_test.go b/internal/runtime/engine_test.go new file mode 100644 index 0000000..ee992f2 --- /dev/null +++ b/internal/runtime/engine_test.go @@ -0,0 +1,218 @@ +package runtime + +import ( + "context" + "log/slog" + "sync" + "testing" + "time" + + "github.com/MB3R-Lab/Bering/internal/connectors/traces" + "github.com/MB3R-Lab/Bering/internal/snapshot" +) + +type fakeClock struct { + mu sync.Mutex + now time.Time +} + +func (c *fakeClock) Now() time.Time { + c.mu.Lock() + defer c.mu.Unlock() + return c.now +} + +func (c *fakeClock) Advance(d time.Duration) { + c.mu.Lock() + defer c.mu.Unlock() + c.now = c.now.Add(d) +} + +type sinkRecorder struct { + mu sync.Mutex + envs []snapshot.Envelope +} + +func (s *sinkRecorder) Write(_ context.Context, env snapshot.Envelope) error { + s.mu.Lock() + defer s.mu.Unlock() + s.envs = append(s.envs, env) + return nil +} + +func (s *sinkRecorder) Count() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.envs) +} + +func (s *sinkRecorder) First() snapshot.Envelope { + s.mu.Lock() + defer s.mu.Unlock() + return s.envs[0] +} + +func (s *sinkRecorder) Last() snapshot.Envelope { + s.mu.Lock() + defer s.mu.Unlock() + return s.envs[len(s.envs)-1] +} + +func TestEngineFlushesWindowIntoSnapshot(t *testing.T) { + t.Parallel() + + clock := 
&fakeClock{now: time.Date(2026, 3, 11, 12, 0, 0, 0, time.UTC)} + sink := &sinkRecorder{} + engine, err := NewEngine(EngineConfig{ + WindowSize: time.Minute, + MaxInMemorySpans: 10, + LateSpanPolicy: "drop", + Sink: sink, + Metrics: NewMetrics(), + Logger: slog.New(slog.NewTextHandler(ioDiscard{}, nil)), + Now: clock.Now, + SourceRef: "bering://serve?listen=:4318", + }) + if err != nil { + t.Fatalf("NewEngine returned error: %v", err) + } + + span := traces.Span{ + TraceID: "trace-1", + SpanID: "span-1", + Service: "frontend", + Name: "GET /checkout", + Kind: "server", + StartTime: clock.Now(), + EndTime: clock.Now().Add(100 * time.Millisecond), + Attributes: map[string]any{"http.request.method": "GET", "http.route": "/checkout"}, + } + if err := engine.Ingest(context.Background(), []traces.Span{span}, clock.Now()); err != nil { + t.Fatalf("Ingest returned error: %v", err) + } + clock.Advance(61 * time.Second) + if err := engine.FlushDue(context.Background()); err != nil { + t.Fatalf("FlushDue returned error: %v", err) + } + if got, want := sink.Count(), 1; got != want { + t.Fatalf("snapshot count mismatch: got=%d want=%d", got, want) + } + env := sink.First() + if env.Counts.Services != 1 || env.Counts.Endpoints != 1 { + t.Fatalf("unexpected snapshot counts: %+v", env.Counts) + } + if env.Ingest.Spans != 1 { + t.Fatalf("unexpected ingest summary: %+v", env.Ingest) + } +} + +func TestEngineDropsLateSpanWhenConfigured(t *testing.T) { + t.Parallel() + + clock := &fakeClock{now: time.Date(2026, 3, 11, 12, 1, 0, 0, time.UTC)} + sink := &sinkRecorder{} + engine, err := NewEngine(EngineConfig{ + WindowSize: time.Minute, + MaxInMemorySpans: 10, + LateSpanPolicy: "drop", + Sink: sink, + Metrics: NewMetrics(), + Logger: slog.New(slog.NewTextHandler(ioDiscard{}, nil)), + Now: clock.Now, + SourceRef: "bering://serve?listen=:4318", + }) + if err != nil { + t.Fatalf("NewEngine returned error: %v", err) + } + + lateSpan := traces.Span{ + TraceID: "trace-1", + SpanID: "span-1", + 
Service: "frontend", + Name: "GET /checkout", + Kind: "server", + StartTime: clock.Now().Add(-2 * time.Minute), + EndTime: clock.Now().Add(-2*time.Minute + 100*time.Millisecond), + Attributes: map[string]any{"http.request.method": "GET", "http.route": "/checkout"}, + } + if err := engine.Ingest(context.Background(), []traces.Span{lateSpan}, clock.Now()); err != nil { + t.Fatalf("Ingest returned error: %v", err) + } + clock.Advance(61 * time.Second) + if err := engine.FlushDue(context.Background()); err != nil { + t.Fatalf("FlushDue returned error: %v", err) + } + if got, want := sink.Count(), 0; got != want { + t.Fatalf("expected no snapshot for dropped late spans, got=%d", got) + } +} + +func TestEngineCarriesForwardObservationTimesAcrossSnapshots(t *testing.T) { + t.Parallel() + + clock := &fakeClock{now: time.Date(2026, 3, 11, 12, 0, 0, 0, time.UTC)} + sink := &sinkRecorder{} + engine, err := NewEngine(EngineConfig{ + WindowSize: time.Minute, + MaxInMemorySpans: 10, + LateSpanPolicy: "drop", + Sink: sink, + Metrics: NewMetrics(), + Logger: slog.New(slog.NewTextHandler(ioDiscard{}, nil)), + Now: clock.Now, + SourceRef: "bering://serve?listen=:4318", + }) + if err != nil { + t.Fatalf("NewEngine returned error: %v", err) + } + + firstSpan := traces.Span{ + TraceID: "trace-1", + SpanID: "span-1", + Service: "frontend", + Name: "GET /checkout", + Kind: "server", + StartTime: clock.Now().Add(10 * time.Second), + EndTime: clock.Now().Add(11 * time.Second), + Attributes: map[string]any{"http.request.method": "GET", "http.route": "/checkout"}, + } + if err := engine.Ingest(context.Background(), []traces.Span{firstSpan}, clock.Now()); err != nil { + t.Fatalf("Ingest returned error: %v", err) + } + clock.Advance(61 * time.Second) + if err := engine.FlushDue(context.Background()); err != nil { + t.Fatalf("FlushDue returned error: %v", err) + } + firstEnv := sink.Last() + firstSeen := firstEnv.Discovery.Services[0].FirstSeen + lastSeen := 
firstEnv.Discovery.Services[0].LastSeen + + secondSpan := traces.Span{ + TraceID: "trace-2", + SpanID: "span-2", + Service: "frontend", + Name: "GET /checkout", + Kind: "server", + StartTime: clock.Now().Add(20 * time.Second), + EndTime: clock.Now().Add(21 * time.Second), + Attributes: map[string]any{"http.request.method": "GET", "http.route": "/checkout"}, + } + if err := engine.Ingest(context.Background(), []traces.Span{secondSpan}, clock.Now()); err != nil { + t.Fatalf("second Ingest returned error: %v", err) + } + clock.Advance(61 * time.Second) + if err := engine.FlushDue(context.Background()); err != nil { + t.Fatalf("second FlushDue returned error: %v", err) + } + + if got, want := sink.Count(), 2; got != want { + t.Fatalf("snapshot count mismatch: got=%d want=%d", got, want) + } + secondEnv := sink.Last() + if got := secondEnv.Discovery.Services[0].FirstSeen; got != firstSeen { + t.Fatalf("first_seen was not carried forward: got=%s want=%s", got, firstSeen) + } + if got := secondEnv.Discovery.Services[0].LastSeen; got == lastSeen { + t.Fatalf("last_seen was not advanced: got=%s", got) + } +} diff --git a/internal/runtime/metrics.go b/internal/runtime/metrics.go new file mode 100644 index 0000000..6a52b79 --- /dev/null +++ b/internal/runtime/metrics.go @@ -0,0 +1,129 @@ +package runtime + +import ( + "net/http" + "time" + + "github.com/MB3R-Lab/Bering/internal/snapshot" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +type Metrics struct { + registry *prometheus.Registry + + spansIngested prometheus.Counter + spansDropped prometheus.Counter + snapshotsEmitted prometheus.Counter + snapshotBuildDuration prometheus.Histogram + currentServices prometheus.Gauge + currentEdges prometheus.Gauge + currentEndpoints prometheus.Gauge + windowLagSeconds prometheus.Gauge + lastSnapshotUnix prometheus.Gauge + snapshotAgeSeconds prometheus.Gauge + diffAddedServices prometheus.Gauge + diffRemovedServices 
prometheus.Gauge + diffAddedEdges prometheus.Gauge + diffRemovedEdges prometheus.Gauge + diffAddedEndpoints prometheus.Gauge + diffRemovedEndpoints prometheus.Gauge + diffChangedServices prometheus.Gauge + diffChangedEdges prometheus.Gauge + diffChangedEndpoints prometheus.Gauge +} + +func NewMetrics() *Metrics { + registry := prometheus.NewRegistry() + m := &Metrics{ + registry: registry, + spansIngested: prometheus.NewCounter(prometheus.CounterOpts{Name: "spans_ingested_total", Help: "Total spans accepted by the runtime ingest path."}), + spansDropped: prometheus.NewCounter(prometheus.CounterOpts{Name: "spans_dropped_total", Help: "Total spans dropped due to late policy, buffer pressure, or decode rejection."}), + snapshotsEmitted: prometheus.NewCounter(prometheus.CounterOpts{Name: "snapshots_emitted_total", Help: "Total discovery snapshots written to configured sinks."}), + snapshotBuildDuration: prometheus.NewHistogram(prometheus.HistogramOpts{Name: "snapshot_build_duration_seconds", Help: "Time spent building and writing one snapshot.", Buckets: prometheus.DefBuckets}), + currentServices: prometheus.NewGauge(prometheus.GaugeOpts{Name: "current_services", Help: "Service count in the latest emitted snapshot."}), + currentEdges: prometheus.NewGauge(prometheus.GaugeOpts{Name: "current_edges", Help: "Edge count in the latest emitted snapshot."}), + currentEndpoints: prometheus.NewGauge(prometheus.GaugeOpts{Name: "current_endpoints", Help: "Endpoint count in the latest emitted snapshot."}), + windowLagSeconds: prometheus.NewGauge(prometheus.GaugeOpts{Name: "window_lag_seconds", Help: "How far the runtime is behind the active window end when flushing."}), + lastSnapshotUnix: prometheus.NewGauge(prometheus.GaugeOpts{Name: "last_snapshot_unixtime", Help: "Unix timestamp of the latest emitted snapshot."}), + snapshotAgeSeconds: prometheus.NewGauge(prometheus.GaugeOpts{Name: "snapshot_age_seconds", Help: "Age in seconds of the latest emitted snapshot."}), + 
diffAddedServices: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_added_services", Help: "Services added in the latest snapshot diff."}), + diffRemovedServices: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_removed_services", Help: "Services removed in the latest snapshot diff."}), + diffAddedEdges: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_added_edges", Help: "Edges added in the latest snapshot diff."}), + diffRemovedEdges: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_removed_edges", Help: "Edges removed in the latest snapshot diff."}), + diffAddedEndpoints: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_added_endpoints", Help: "Endpoints added in the latest snapshot diff."}), + diffRemovedEndpoints: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_removed_endpoints", Help: "Endpoints removed in the latest snapshot diff."}), + diffChangedServices: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_changed_services", Help: "Services changed in the latest snapshot diff."}), + diffChangedEdges: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_changed_edges", Help: "Edges changed in the latest snapshot diff."}), + diffChangedEndpoints: prometheus.NewGauge(prometheus.GaugeOpts{Name: "diff_changed_endpoints", Help: "Endpoints changed in the latest snapshot diff."}), + } + registry.MustRegister( + m.spansIngested, + m.spansDropped, + m.snapshotsEmitted, + m.snapshotBuildDuration, + m.currentServices, + m.currentEdges, + m.currentEndpoints, + m.windowLagSeconds, + m.lastSnapshotUnix, + m.snapshotAgeSeconds, + m.diffAddedServices, + m.diffRemovedServices, + m.diffAddedEdges, + m.diffRemovedEdges, + m.diffAddedEndpoints, + m.diffRemovedEndpoints, + m.diffChangedServices, + m.diffChangedEdges, + m.diffChangedEndpoints, + ) + return m +} + +func (m *Metrics) Handler() http.Handler { + return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{EnableOpenMetrics: true}) +} + +func (m *Metrics) AddIngested(count int) { + if count 
> 0 { + m.spansIngested.Add(float64(count)) + } +} + +func (m *Metrics) AddDropped(count int) { + if count > 0 { + m.spansDropped.Add(float64(count)) + } +} + +func (m *Metrics) SetWindowLag(now, end time.Time) { + if now.IsZero() || end.IsZero() || !now.After(end) { + m.windowLagSeconds.Set(0) + return + } + m.windowLagSeconds.Set(now.Sub(end).Seconds()) +} + +func (m *Metrics) RecordSnapshot(env snapshot.Envelope, buildDuration time.Duration, now time.Time) { + m.snapshotsEmitted.Inc() + m.snapshotBuildDuration.Observe(buildDuration.Seconds()) + m.currentServices.Set(float64(env.Counts.Services)) + m.currentEdges.Set(float64(env.Counts.Edges)) + m.currentEndpoints.Set(float64(env.Counts.Endpoints)) + if emittedAt, err := time.Parse(time.RFC3339, env.Metadata.EmittedAt); err == nil { + m.lastSnapshotUnix.Set(float64(emittedAt.Unix())) + if !now.IsZero() { + m.snapshotAgeSeconds.Set(now.Sub(emittedAt).Seconds()) + } + } + m.diffAddedServices.Set(float64(env.Diff.AddedServices)) + m.diffRemovedServices.Set(float64(env.Diff.RemovedServices)) + m.diffChangedServices.Set(float64(env.Diff.ChangedServices)) + m.diffAddedEdges.Set(float64(env.Diff.AddedEdges)) + m.diffRemovedEdges.Set(float64(env.Diff.RemovedEdges)) + m.diffChangedEdges.Set(float64(env.Diff.ChangedEdges)) + m.diffAddedEndpoints.Set(float64(env.Diff.AddedEndpoints)) + m.diffRemovedEndpoints.Set(float64(env.Diff.RemovedEndpoints)) + m.diffChangedEndpoints.Set(float64(env.Diff.ChangedEndpoints)) +} diff --git a/internal/runtime/service.go b/internal/runtime/service.go new file mode 100644 index 0000000..be65753 --- /dev/null +++ b/internal/runtime/service.go @@ -0,0 +1,182 @@ +package runtime + +import ( + "context" + "fmt" + "log/slog" + "net" + "net/http" + "strings" + "sync/atomic" + "time" + + collecttracev1 "go.opentelemetry.io/proto/otlp/collector/trace/v1" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" + + "github.com/MB3R-Lab/Bering/internal/config" + 
"github.com/MB3R-Lab/Bering/internal/connectors/otlp" + "github.com/MB3R-Lab/Bering/internal/discovery" + "github.com/MB3R-Lab/Bering/internal/overlay" + "github.com/MB3R-Lab/Bering/internal/snapshot" +) + +type Service struct { + cfg config.ServeConfig + logger *slog.Logger + metrics *Metrics + engine *Engine + server *http.Server + listener net.Listener + ready atomic.Bool +} + +func NewService(cfg config.ServeConfig, overlays []overlay.File, logger *slog.Logger) (*Service, error) { + metrics := NewMetrics() + if logger == nil { + logger = slog.New(slog.NewTextHandler(ioDiscard{}, nil)) + } + engine, err := NewEngine(EngineConfig{ + WindowSize: cfg.Runtime.WindowSize.Duration(), + MaxInMemorySpans: cfg.Runtime.MaxInMemorySpans, + LateSpanPolicy: cfg.Runtime.LateSpanPolicy, + Sink: FileSink{Directory: cfg.Sink.Directory, LatestPath: cfg.Sink.LatestPath}, + Metrics: metrics, + Logger: logger, + SourceRef: discovery.BuildServeSourceRef(cfg.Server.ListenAddress), + Sources: []snapshot.SourceSummary{{Type: "traces", Connector: otlp.ConnectorName, Ref: discovery.BuildServeSourceRef(cfg.Server.ListenAddress)}}, + Overlays: overlays, + }) + if err != nil { + return nil, err + } + service := &Service{cfg: cfg, logger: logger, metrics: metrics, engine: engine} + mux := http.NewServeMux() + mux.HandleFunc("/v1/traces", service.handleOTLP) + mux.HandleFunc("/healthz", service.handleHealth) + mux.HandleFunc("/readyz", service.handleReady) + mux.Handle("/metrics", metrics.Handler()) + service.server = &http.Server{Handler: mux, ReadHeaderTimeout: 5 * time.Second} + return service, nil +} + +func (s *Service) Addr() string { + if s.listener == nil { + return "" + } + return s.listener.Addr().String() +} + +func (s *Service) Run(ctx context.Context) error { + listener, err := net.Listen("tcp", s.cfg.Server.ListenAddress) + if err != nil { + return fmt.Errorf("listen on %s: %w", s.cfg.Server.ListenAddress, err) + } + s.listener = listener + s.ready.Store(true) + + errCh := 
make(chan error, 1) + go func() { + ticker := time.NewTicker(s.cfg.Runtime.FlushInterval.Duration()) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := s.engine.FlushDue(context.Background()); err != nil { + errCh <- err + return + } + } + } + }() + go func() { + if err := s.server.Serve(listener); err != nil && err != http.ErrServerClosed { + errCh <- err + } + }() + + s.logger.Info("bering runtime service listening", slog.String("address", listener.Addr().String())) + + select { + case <-ctx.Done(): + s.ready.Store(false) + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := s.server.Shutdown(shutdownCtx); err != nil { + return fmt.Errorf("http shutdown: %w", err) + } + if err := s.engine.Close(shutdownCtx); err != nil { + return fmt.Errorf("flush final snapshot: %w", err) + } + return nil + case err := <-errCh: + s.ready.Store(false) + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = s.server.Shutdown(shutdownCtx) + return err + } +} + +func (s *Service) handleOTLP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + defer r.Body.Close() + spans, err := otlp.DecodeHTTPRequest(r, s.cfg.Server.MaxRequestBytes) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + if err := s.engine.Ingest(r.Context(), spans, time.Now().UTC()); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + if err := writeOTLPResponse(w, r); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } +} + +func (s *Service) handleHealth(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok\n")) +} + +func (s *Service) handleReady(w http.ResponseWriter, _ *http.Request) { + if !s.ready.Load() 
{ + http.Error(w, "not ready", http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ready\n")) +} + +func writeOTLPResponse(w http.ResponseWriter, r *http.Request) error { + resp := &collecttracev1.ExportTraceServiceResponse{} + if stringsContainsJSON(r.Header.Get("Content-Type")) { + raw, err := protojson.Marshal(resp) + if err != nil { + return err + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, err = w.Write(raw) + return err + } + raw, err := proto.Marshal(resp) + if err != nil { + return err + } + w.Header().Set("Content-Type", "application/x-protobuf") + w.WriteHeader(http.StatusOK) + _, err = w.Write(raw) + return err +} + +func stringsContainsJSON(contentType string) bool { + return strings.Contains(strings.ToLower(contentType), "json") +} diff --git a/internal/runtime/service_test.go b/internal/runtime/service_test.go new file mode 100644 index 0000000..3506bd4 --- /dev/null +++ b/internal/runtime/service_test.go @@ -0,0 +1,167 @@ +package runtime + +import ( + "bytes" + "context" + "io" + "log/slog" + "net/http" + "os" + "path/filepath" + "strings" + "testing" + "time" + + collecttracev1 "go.opentelemetry.io/proto/otlp/collector/trace/v1" + commonv1 "go.opentelemetry.io/proto/otlp/common/v1" + resourcev1 "go.opentelemetry.io/proto/otlp/resource/v1" + tracev1 "go.opentelemetry.io/proto/otlp/trace/v1" + "google.golang.org/protobuf/proto" + + "github.com/MB3R-Lab/Bering/internal/config" +) + +func TestServiceEndToEndOTLPHTTP(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + cfg := config.DefaultServeConfig() + cfg.Server.ListenAddress = "127.0.0.1:0" + cfg.Runtime.WindowSize = config.Duration(150 * time.Millisecond) + cfg.Runtime.FlushInterval = config.Duration(25 * time.Millisecond) + cfg.Runtime.MaxInMemorySpans = 100 + cfg.Sink.Directory = filepath.Join(dir, "snapshots") + cfg.Sink.LatestPath = filepath.Join(dir, "latest.json") + + service, err := 
NewService(cfg, nil, slog.New(slog.NewTextHandler(ioDiscard{}, nil))) + if err != nil { + t.Fatalf("NewService returned error: %v", err) + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + errCh := make(chan error, 1) + go func() { errCh <- service.Run(ctx) }() + defer func() { + cancel() + select { + case err := <-errCh: + if err != nil { + t.Fatalf("service returned error: %v", err) + } + case <-time.After(2 * time.Second): + t.Fatal("service did not stop in time") + } + }() + + addr := waitForAddr(t, service, 2*time.Second) + postOTLPSpan(t, "http://"+addr+"/v1/traces") + + waitForFile(t, cfg.Sink.LatestPath, 3*time.Second) + latestRaw, err := os.ReadFile(cfg.Sink.LatestPath) + if err != nil { + t.Fatalf("read latest snapshot: %v", err) + } + if !bytes.Contains(latestRaw, []byte(`"snapshot_id"`)) { + t.Fatalf("latest snapshot missing snapshot_id: %s", latestRaw) + } + + checkStatus(t, "http://"+addr+"/healthz", http.StatusOK) + checkStatus(t, "http://"+addr+"/readyz", http.StatusOK) + metricsBody := readBody(t, "http://"+addr+"/metrics") + if !strings.Contains(metricsBody, "spans_ingested_total") { + t.Fatalf("metrics endpoint missing spans_ingested_total:\n%s", metricsBody) + } + if !strings.Contains(metricsBody, "snapshots_emitted_total") { + t.Fatalf("metrics endpoint missing snapshots_emitted_total:\n%s", metricsBody) + } +} + +func waitForAddr(t *testing.T, service *Service, timeout time.Duration) string { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if addr := service.Addr(); addr != "" { + return addr + } + time.Sleep(10 * time.Millisecond) + } + t.Fatal("service did not publish an address in time") + return "" +} + +func postOTLPSpan(t *testing.T, url string) { + t.Helper() + start := time.Now().UTC() + req := &collecttracev1.ExportTraceServiceRequest{ + ResourceSpans: []*tracev1.ResourceSpans{{ + Resource: &resourcev1.Resource{Attributes: []*commonv1.KeyValue{{Key: 
"service.name", Value: &commonv1.AnyValue{Value: &commonv1.AnyValue_StringValue{StringValue: "frontend"}}}}}, + ScopeSpans: []*tracev1.ScopeSpans{{ + Spans: []*tracev1.Span{{ + TraceId: []byte{0xaa, 0xbb, 0xcc}, + SpanId: []byte{0x01, 0x02}, + Name: "GET /checkout", + Kind: tracev1.Span_SPAN_KIND_SERVER, + StartTimeUnixNano: uint64(start.UnixNano()), + EndTimeUnixNano: uint64(start.Add(10 * time.Millisecond).UnixNano()), + Attributes: []*commonv1.KeyValue{ + {Key: "http.request.method", Value: &commonv1.AnyValue{Value: &commonv1.AnyValue_StringValue{StringValue: "GET"}}}, + {Key: "http.route", Value: &commonv1.AnyValue{Value: &commonv1.AnyValue_StringValue{StringValue: "/checkout"}}}, + }, + }}, + }}, + }}, + } + raw, err := proto.Marshal(req) + if err != nil { + t.Fatalf("marshal otlp request: %v", err) + } + resp, err := http.Post(url, "application/x-protobuf", bytes.NewReader(raw)) + if err != nil { + t.Fatalf("post otlp span: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + t.Fatalf("unexpected status %d: %s", resp.StatusCode, body) + } +} + +func waitForFile(t *testing.T, path string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if _, err := os.Stat(path); err == nil { + return + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("file %s was not created in time", path) +} + +func checkStatus(t *testing.T, url string, want int) { + t.Helper() + resp, err := http.Get(url) + if err != nil { + t.Fatalf("get %s: %v", url, err) + } + defer resp.Body.Close() + if resp.StatusCode != want { + body, _ := io.ReadAll(resp.Body) + t.Fatalf("unexpected status for %s: got=%d want=%d body=%s", url, resp.StatusCode, want, body) + } +} + +func readBody(t *testing.T, url string) string { + t.Helper() + resp, err := http.Get(url) + if err != nil { + t.Fatalf("get %s: %v", url, err) + } + defer resp.Body.Close() + body, err := 
// fallbackFilenameReplacer neutralizes characters that are unsafe in a file
// name when a timestamp cannot be parsed. Both "/" and "\" become "-" so the
// result can never introduce a path separator on any platform; the remaining
// characters are dropped.
var fallbackFilenameReplacer = strings.NewReplacer(
	":", "",
	"/", "-",
	"\\", "-",
	"?", "",
	"&", "",
	"=", "",
	"%", "",
)

// sanitizeFilename turns a window timestamp into a token that is safe to embed
// in a file name.
//
// An RFC 3339 timestamp is converted to UTC and rendered in the compact form
// "20060102T150405Z". Any other input is passed through
// fallbackFilenameReplacer unchanged apart from the characters it handles.
func sanitizeFilename(ts string) string {
	if parsed, err := time.Parse(time.RFC3339, ts); err == nil {
		return parsed.UTC().Format("20060102T150405Z")
	}
	return fallbackFilenameReplacer.Replace(ts)
}
ExpectedSnapshotSchemaDigest = "sha256:87e4e887ed4a37b72f6136e268b73552eccb92941c4de2c6f3a514dd066ea972" ) type SchemaRef struct { @@ -22,3 +27,12 @@ func ExpectedRef() SchemaRef { Digest: ExpectedSchemaDigest, } } + +func ExpectedSnapshotRef() SchemaRef { + return SchemaRef{ + Name: ExpectedSnapshotSchemaName, + Version: ExpectedSnapshotSchemaVersion, + URI: ExpectedSnapshotSchemaURI, + Digest: ExpectedSnapshotSchemaDigest, + } +} diff --git a/internal/schema/contract_test.go b/internal/schema/contract_test.go index 66001c9..0d269c9 100644 --- a/internal/schema/contract_test.go +++ b/internal/schema/contract_test.go @@ -16,6 +16,14 @@ func TestEmbeddedSchemaDigestMatchesPinned(t *testing.T) { } } +func TestEmbeddedSnapshotSchemaDigestMatchesPinned(t *testing.T) { + t.Parallel() + + if got, want := EmbeddedSnapshotSchemaDigest(), ExpectedSnapshotSchemaDigest; got != want { + t.Fatalf("embedded snapshot schema digest mismatch: got=%s want=%s", got, want) + } +} + func TestValidateStrict(t *testing.T) { t.Parallel() @@ -51,3 +59,17 @@ func TestExpectedSchemaURIVersionPathMatchesConstant(t *testing.T) { t.Fatalf("ExpectedSchemaURI path %q must contain %q", parsed.Path, wantSegment) } } + +func TestExpectedSnapshotSchemaURIVersionPathMatchesConstant(t *testing.T) { + t.Parallel() + + parsed, err := url.Parse(ExpectedSnapshotSchemaURI) + if err != nil { + t.Fatalf("parse ExpectedSnapshotSchemaURI: %v", err) + } + + wantSegment := fmt.Sprintf("/v%s/", ExpectedSnapshotSchemaVersion) + if !strings.Contains(parsed.Path, wantSegment) { + t.Fatalf("ExpectedSnapshotSchemaURI path %q must contain %q", parsed.Path, wantSegment) + } +} diff --git a/internal/schema/schema/snapshot.schema.json b/internal/schema/schema/snapshot.schema.json new file mode 100644 index 0000000..6f9cdf7 --- /dev/null +++ b/internal/schema/schema/snapshot.schema.json @@ -0,0 +1,330 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": 
"https://mb3r-lab.github.io/Bering/schema/snapshot/v1.0.0/snapshot.schema.json", + "title": "BeringDiscoverySnapshot", + "type": "object", + "required": [ + "snapshot_id", + "topology_version", + "window_start", + "window_end", + "ingest", + "counts", + "coverage", + "sources", + "diff", + "discovery", + "model", + "metadata" + ], + "properties": { + "snapshot_id": { "type": "string" }, + "topology_version": { "type": "string" }, + "window_start": { "type": "string", "format": "date-time" }, + "window_end": { "type": "string", "format": "date-time" }, + "ingest": { + "type": "object", + "required": ["spans", "traces", "dropped_spans", "late_spans"], + "properties": { + "spans": { "type": "integer", "minimum": 0 }, + "traces": { "type": "integer", "minimum": 0 }, + "dropped_spans": { "type": "integer", "minimum": 0 }, + "late_spans": { "type": "integer", "minimum": 0 } + } + }, + "counts": { + "type": "object", + "required": ["services", "edges", "endpoints"], + "properties": { + "services": { "type": "integer", "minimum": 0 }, + "edges": { "type": "integer", "minimum": 0 }, + "endpoints": { "type": "integer", "minimum": 0 } + } + }, + "coverage": { + "type": "object", + "required": ["confidence", "service_support_min", "edge_support_min", "endpoint_support_min"], + "properties": { + "confidence": { "type": "number", "minimum": 0, "maximum": 1 }, + "service_support_min": { "type": "integer", "minimum": 0 }, + "edge_support_min": { "type": "integer", "minimum": 0 }, + "endpoint_support_min": { "type": "integer", "minimum": 0 } + } + }, + "sources": { + "type": "array", + "items": { + "type": "object", + "required": ["type"], + "properties": { + "type": { "type": "string" }, + "connector": { "type": "string" }, + "ref": { "type": "string" }, + "observations": { "type": "integer", "minimum": 0 } + } + } + }, + "diff": { + "type": "object", + "required": [ + "added_services", + "removed_services", + "changed_services", + "added_edges", + "removed_edges", + 
"changed_edges", + "added_endpoints", + "removed_endpoints", + "changed_endpoints" + ], + "properties": { + "added_services": { "type": "integer", "minimum": 0 }, + "removed_services": { "type": "integer", "minimum": 0 }, + "changed_services": { "type": "integer", "minimum": 0 }, + "added_edges": { "type": "integer", "minimum": 0 }, + "removed_edges": { "type": "integer", "minimum": 0 }, + "changed_edges": { "type": "integer", "minimum": 0 }, + "added_endpoints": { "type": "integer", "minimum": 0 }, + "removed_endpoints": { "type": "integer", "minimum": 0 }, + "changed_endpoints": { "type": "integer", "minimum": 0 } + } + }, + "discovery": { + "type": "object", + "required": ["services", "edges", "endpoints"], + "properties": { + "services": { + "type": "array", + "items": { "$ref": "#/$defs/serviceRecord" } + }, + "edges": { + "type": "array", + "items": { "$ref": "#/$defs/edgeRecord" } + }, + "endpoints": { + "type": "array", + "items": { "$ref": "#/$defs/endpointRecord" } + }, + "overlays": { + "type": "array", + "items": { + "type": "object", + "required": ["name", "precedence"], + "properties": { + "name": { "type": "string" }, + "ref": { "type": "string" }, + "precedence": { "type": "integer", "minimum": 0 } + } + } + } + } + }, + "model": { + "type": "object", + "required": ["services", "edges", "endpoints", "metadata"], + "properties": { + "services": { + "type": "array", + "items": { + "type": "object", + "required": ["id", "name", "replicas"], + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "replicas": { "type": "integer", "minimum": 0 } + } + } + }, + "edges": { + "type": "array", + "items": { + "type": "object", + "required": ["from", "to", "kind", "blocking"], + "properties": { + "from": { "type": "string" }, + "to": { "type": "string" }, + "kind": { "type": "string", "enum": ["sync", "async"] }, + "blocking": { "type": "boolean" } + } + } + }, + "endpoints": { + "type": "array", + "items": { + "type": "object", + 
"required": ["id", "entry_service", "success_predicate_ref"], + "properties": { + "id": { "type": "string" }, + "entry_service": { "type": "string" }, + "success_predicate_ref": { "type": "string" } + } + } + }, + "metadata": { + "type": "object", + "required": ["source_type", "source_ref", "discovered_at", "confidence", "schema"], + "properties": { + "source_type": { "type": "string" }, + "source_ref": { "type": "string" }, + "discovered_at": { "type": "string", "format": "date-time" }, + "confidence": { "type": "number", "minimum": 0, "maximum": 1 }, + "schema": { "$ref": "#/$defs/schemaRef" } + } + } + } + }, + "metadata": { + "type": "object", + "required": ["source_type", "source_ref", "emitted_at", "confidence", "schema"], + "properties": { + "source_type": { "type": "string" }, + "source_ref": { "type": "string" }, + "emitted_at": { "type": "string", "format": "date-time" }, + "confidence": { "type": "number", "minimum": 0, "maximum": 1 }, + "schema": { "$ref": "#/$defs/schemaRef" } + } + } + }, + "$defs": { + "schemaRef": { + "type": "object", + "required": ["name", "version", "uri", "digest"], + "properties": { + "name": { "type": "string" }, + "version": { "type": "string" }, + "uri": { "type": "string" }, + "digest": { "type": "string" } + } + }, + "provenance": { + "type": "object", + "required": ["type"], + "properties": { + "type": { "type": "string" }, + "connector": { "type": "string" }, + "name": { "type": "string" }, + "ref": { "type": "string" }, + "precedence": { "type": "integer", "minimum": 0 } + } + }, + "support": { + "type": "object", + "required": ["observations", "trace_count"], + "properties": { + "observations": { "type": "integer", "minimum": 0 }, + "trace_count": { "type": "integer", "minimum": 0 }, + "evidence": { + "type": "array", + "items": { "type": "string" } + } + } + }, + "commonMetadata": { + "type": "object", + "properties": { + "labels": { + "type": "object", + "additionalProperties": { "type": "string" } + }, + "tags": { + 
"type": "array", + "items": { "type": "string" } + }, + "slo_refs": { + "type": "array", + "items": { "type": "string" } + }, + "attributes": { + "type": "object", + "additionalProperties": { "type": "string" } + } + } + }, + "serviceRecord": { + "type": "object", + "required": ["id", "name", "replicas", "support"], + "properties": { + "id": { "type": "string" }, + "name": { "type": "string" }, + "replicas": { "type": "integer", "minimum": 0 }, + "support": { "$ref": "#/$defs/support" }, + "first_seen": { "type": "string", "format": "date-time" }, + "last_seen": { "type": "string", "format": "date-time" }, + "provenance": { + "type": "array", + "items": { "$ref": "#/$defs/provenance" } + }, + "metadata": { + "allOf": [ + { "$ref": "#/$defs/commonMetadata" }, + { + "type": "object", + "properties": { + "failure_eligible": { "type": "boolean" }, + "replicas_override": { "type": "integer", "minimum": 0 } + } + } + ] + } + } + }, + "edgeRecord": { + "type": "object", + "required": ["id", "from", "to", "kind", "blocking", "support"], + "properties": { + "id": { "type": "string" }, + "from": { "type": "string" }, + "to": { "type": "string" }, + "kind": { "type": "string", "enum": ["sync", "async"] }, + "blocking": { "type": "boolean" }, + "support": { "$ref": "#/$defs/support" }, + "first_seen": { "type": "string", "format": "date-time" }, + "last_seen": { "type": "string", "format": "date-time" }, + "provenance": { + "type": "array", + "items": { "$ref": "#/$defs/provenance" } + }, + "metadata": { + "allOf": [ + { "$ref": "#/$defs/commonMetadata" }, + { + "type": "object", + "properties": { + "weight": { "type": "number" } + } + } + ] + } + } + }, + "endpointRecord": { + "type": "object", + "required": ["id", "entry_service", "support"], + "properties": { + "id": { "type": "string" }, + "entry_service": { "type": "string" }, + "method": { "type": "string" }, + "path": { "type": "string" }, + "support": { "$ref": "#/$defs/support" }, + "first_seen": { "type": "string", 
"format": "date-time" }, + "last_seen": { "type": "string", "format": "date-time" }, + "provenance": { + "type": "array", + "items": { "$ref": "#/$defs/provenance" } + }, + "metadata": { + "allOf": [ + { "$ref": "#/$defs/commonMetadata" }, + { + "type": "object", + "properties": { + "weight": { "type": "number" }, + "predicate_ref": { "type": "string" } + } + } + ] + } + } + } + } +} diff --git a/internal/schema/schema_sync_test.go b/internal/schema/schema_sync_test.go index 1a97ad1..495568c 100644 --- a/internal/schema/schema_sync_test.go +++ b/internal/schema/schema_sync_test.go @@ -51,3 +51,47 @@ func TestAPISchemaStaysInSyncWithEmbeddedSchema(t *testing.T) { t.Fatalf("schema mismatch: %s and %s must stay semantically identical", internalPath, apiPath) } } + +func TestAPISnapshotSchemaStaysInSyncWithEmbeddedSchema(t *testing.T) { + t.Parallel() + + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("runtime.Caller failed") + } + + pkgDir := filepath.Dir(thisFile) + internalPath := filepath.Join(pkgDir, "schema", "snapshot.schema.json") + apiPath := filepath.Join(pkgDir, "..", "..", "api", "schema", "snapshot.schema.json") + + internalRaw, err := os.ReadFile(internalPath) + if err != nil { + t.Fatalf("read internal snapshot schema: %v", err) + } + apiRaw, err := os.ReadFile(apiPath) + if err != nil { + t.Fatalf("read api snapshot schema: %v", err) + } + + var internalObj any + if err := json.Unmarshal(internalRaw, &internalObj); err != nil { + t.Fatalf("decode internal snapshot schema json: %v", err) + } + var apiObj any + if err := json.Unmarshal(apiRaw, &apiObj); err != nil { + t.Fatalf("decode api snapshot schema json: %v", err) + } + + internalNorm, err := json.Marshal(internalObj) + if err != nil { + t.Fatalf("normalize internal snapshot schema json: %v", err) + } + apiNorm, err := json.Marshal(apiObj) + if err != nil { + t.Fatalf("normalize api snapshot schema json: %v", err) + } + + if string(internalNorm) != string(apiNorm) { + t.Fatalf("schema 
mismatch: %s and %s must stay semantically identical", internalPath, apiPath) + } +} diff --git a/internal/schema/validator.go b/internal/schema/validator.go index 9451892..0b84770 100644 --- a/internal/schema/validator.go +++ b/internal/schema/validator.go @@ -16,10 +16,16 @@ import ( //go:embed schema/model.schema.json var canonicalSchema []byte +//go:embed schema/snapshot.schema.json +var canonicalSnapshotSchema []byte + var ( - schemaOnce sync.Once - schemaObj *jsonschema.Schema - schemaErr error + modelSchemaOnce sync.Once + modelSchemaObj *jsonschema.Schema + modelSchemaErr error + snapshotSchemaOnce sync.Once + snapshotSchemaObj *jsonschema.Schema + snapshotSchemaErr error ) func EmbeddedSchema() []byte { @@ -28,74 +34,158 @@ func EmbeddedSchema() []byte { return out } +func EmbeddedSnapshotSchema() []byte { + out := make([]byte, len(canonicalSnapshotSchema)) + copy(out, canonicalSnapshotSchema) + return out +} + func EmbeddedSchemaDigest() string { sum := sha256.Sum256(canonicalSchema) return "sha256:" + hex.EncodeToString(sum[:]) } +func EmbeddedSnapshotSchemaDigest() string { + sum := sha256.Sum256(canonicalSnapshotSchema) + return "sha256:" + hex.EncodeToString(sum[:]) +} + func ValidateStrict(ref SchemaRef) error { - if strings.TrimSpace(ref.Name) == "" { - return fmt.Errorf("metadata.schema.name cannot be empty") + return validateStrictAgainst(ref, ExpectedRef()) +} + +func ValidateSnapshotStrict(ref SchemaRef) error { + return validateStrictAgainst(ref, ExpectedSnapshotRef()) +} + +func ValidateJSON(raw []byte) error { + return validateDocument(raw, ExpectedRef(), compiledModelSchema) +} + +func ValidateSnapshotJSON(raw []byte) error { + doc, err := decodeJSON(raw) + if err != nil { + return fmt.Errorf("decode snapshot json: %w", err) } - if strings.TrimSpace(ref.Version) == "" { - return fmt.Errorf("metadata.schema.version cannot be empty") + compiled, err := compiledSnapshotSchema() + if err != nil { + return fmt.Errorf("compile snapshot schema: 
%w", err) } - if strings.TrimSpace(ref.URI) == "" { - return fmt.Errorf("metadata.schema.uri cannot be empty") + if err := compiled.Validate(doc); err != nil { + return fmt.Errorf("jsonschema validation failed: %w", err) } - if strings.TrimSpace(ref.Digest) == "" { - return fmt.Errorf("metadata.schema.digest cannot be empty") + ref, err := extractSchemaRef(doc) + if err != nil { + return fmt.Errorf("extract metadata.schema: %w", err) } - - if ref.Name != ExpectedSchemaName { - return fmt.Errorf("schema name mismatch: got %q want %q", ref.Name, ExpectedSchemaName) + if err := ValidateSnapshotStrict(ref); err != nil { + return fmt.Errorf("strict metadata.schema validation failed: %w", err) } - if ref.Version != ExpectedSchemaVersion { - return fmt.Errorf("schema version mismatch: got %q want %q", ref.Version, ExpectedSchemaVersion) + root, ok := doc.(map[string]any) + if !ok { + return fmt.Errorf("snapshot root is not an object") } - if ref.URI != ExpectedSchemaURI { - return fmt.Errorf("schema uri mismatch: got %q want %q", ref.URI, ExpectedSchemaURI) + rawModel, err := json.Marshal(root["model"]) + if err != nil { + return fmt.Errorf("encode nested model: %w", err) } - if ref.Digest != ExpectedSchemaDigest { - return fmt.Errorf("schema digest mismatch: got %q want %q", ref.Digest, ExpectedSchemaDigest) + if err := ValidateJSON(rawModel); err != nil { + return fmt.Errorf("nested model validation failed: %w", err) } return nil } -func ValidateJSON(raw []byte) error { +func ValidateArtifactJSON(raw []byte) error { doc, err := decodeJSON(raw) if err != nil { - return fmt.Errorf("decode model json: %w", err) + return fmt.Errorf("decode artifact json: %w", err) + } + ref, err := extractSchemaRef(doc) + if err != nil { + return fmt.Errorf("extract metadata.schema: %w", err) + } + switch ref.Name { + case ExpectedSchemaName: + return ValidateJSON(raw) + case ExpectedSnapshotSchemaName: + return ValidateSnapshotJSON(raw) + default: + return fmt.Errorf("unsupported artifact 
schema name: %s", ref.Name) } +} - compiled, err := compiledSchema() +func validateDocument(raw []byte, expected SchemaRef, compiler func() (*jsonschema.Schema, error)) error { + doc, err := decodeJSON(raw) + if err != nil { + return fmt.Errorf("decode model json: %w", err) + } + compiled, err := compiler() if err != nil { return fmt.Errorf("compile canonical schema: %w", err) } if err := compiled.Validate(doc); err != nil { return fmt.Errorf("jsonschema validation failed: %w", err) } - ref, err := extractSchemaRef(doc) if err != nil { return fmt.Errorf("extract metadata.schema: %w", err) } - if err := ValidateStrict(ref); err != nil { + if err := validateStrictAgainst(ref, expected); err != nil { return fmt.Errorf("strict metadata.schema validation failed: %w", err) } return nil } -func compiledSchema() (*jsonschema.Schema, error) { - schemaOnce.Do(func() { +func validateStrictAgainst(ref, expected SchemaRef) error { + if strings.TrimSpace(ref.Name) == "" { + return fmt.Errorf("metadata.schema.name cannot be empty") + } + if strings.TrimSpace(ref.Version) == "" { + return fmt.Errorf("metadata.schema.version cannot be empty") + } + if strings.TrimSpace(ref.URI) == "" { + return fmt.Errorf("metadata.schema.uri cannot be empty") + } + if strings.TrimSpace(ref.Digest) == "" { + return fmt.Errorf("metadata.schema.digest cannot be empty") + } + if ref.Name != expected.Name { + return fmt.Errorf("schema name mismatch: got %q want %q", ref.Name, expected.Name) + } + if ref.Version != expected.Version { + return fmt.Errorf("schema version mismatch: got %q want %q", ref.Version, expected.Version) + } + if ref.URI != expected.URI { + return fmt.Errorf("schema uri mismatch: got %q want %q", ref.URI, expected.URI) + } + if ref.Digest != expected.Digest { + return fmt.Errorf("schema digest mismatch: got %q want %q", ref.Digest, expected.Digest) + } + return nil +} + +func compiledModelSchema() (*jsonschema.Schema, error) { + modelSchemaOnce.Do(func() { compiler := 
jsonschema.NewCompiler() if err := compiler.AddResource(ExpectedSchemaURI, bytes.NewReader(canonicalSchema)); err != nil { - schemaErr = err + modelSchemaErr = err return } - schemaObj, schemaErr = compiler.Compile(ExpectedSchemaURI) + modelSchemaObj, modelSchemaErr = compiler.Compile(ExpectedSchemaURI) }) - return schemaObj, schemaErr + return modelSchemaObj, modelSchemaErr +} + +func compiledSnapshotSchema() (*jsonschema.Schema, error) { + snapshotSchemaOnce.Do(func() { + compiler := jsonschema.NewCompiler() + if err := compiler.AddResource(ExpectedSnapshotSchemaURI, bytes.NewReader(canonicalSnapshotSchema)); err != nil { + snapshotSchemaErr = err + return + } + snapshotSchemaObj, snapshotSchemaErr = compiler.Compile(ExpectedSnapshotSchemaURI) + }) + return snapshotSchemaObj, snapshotSchemaErr } func decodeJSON(raw []byte) (any, error) { @@ -112,6 +202,14 @@ func decodeJSON(raw []byte) (any, error) { return doc, nil } +func ExtractSchemaRef(raw []byte) (SchemaRef, error) { + doc, err := decodeJSON(raw) + if err != nil { + return SchemaRef{}, err + } + return extractSchemaRef(doc) +} + func extractSchemaRef(doc any) (SchemaRef, error) { root, ok := doc.(map[string]any) if !ok { diff --git a/internal/schema/validator_test.go b/internal/schema/validator_test.go index 4c5019a..db43281 100644 --- a/internal/schema/validator_test.go +++ b/internal/schema/validator_test.go @@ -55,3 +55,60 @@ func TestValidateJSON_StrictDigestFail(t *testing.T) { t.Fatal("expected strict digest validation failure, got nil") } } + +func TestValidateSnapshotJSON_Success(t *testing.T) { + t.Parallel() + + raw := []byte(`{ + "snapshot_id":"snap-123", + "topology_version":"sha256:abc", + "window_start":"2026-03-03T00:00:00Z", + "window_end":"2026-03-03T00:00:00Z", + "ingest":{"spans":1,"traces":1,"dropped_spans":0,"late_spans":0}, + "counts":{"services":1,"edges":0,"endpoints":1}, + "coverage":{"confidence":0.72,"service_support_min":1,"edge_support_min":0,"endpoint_support_min":1}, + 
"sources":[{"type":"traces","connector":"trace_file","ref":"bering://discover","observations":1}], + "diff":{"added_services":1,"removed_services":0,"changed_services":0,"added_edges":0,"removed_edges":0,"changed_edges":0,"added_endpoints":1,"removed_endpoints":0,"changed_endpoints":0}, + "discovery":{ + "services":[{"id":"frontend","name":"frontend","replicas":1,"support":{"observations":1,"trace_count":1}}], + "edges":[], + "endpoints":[{"id":"frontend:GET /health","entry_service":"frontend","method":"GET","path":"/health","support":{"observations":1,"trace_count":1}}] + }, + "model": { + "services": [{"id":"frontend","name":"frontend","replicas":1}], + "edges": [], + "endpoints": [{"id":"frontend:GET /health","entry_service":"frontend","success_predicate_ref":"frontend:GET /health"}], + "metadata": { + "source_type":"bering", + "source_ref":"bering://discover?input=examples%2Ftraces", + "discovered_at":"2026-03-03T00:00:00Z", + "confidence":0.72, + "schema":{ + "name":"io.mb3r.bering.model", + "version":"1.0.0", + "uri":"https://mb3r-lab.github.io/Bering/schema/model/v1.0.0/model.schema.json", + "digest":"sha256:272277c093f37580adcd2dded225bd37c86539d642d7910baad7e4228227d1a7" + } + } + }, + "metadata": { + "source_type":"bering", + "source_ref":"bering://serve?listen=:4318", + "emitted_at":"2026-03-03T00:00:00Z", + "confidence":0.72, + "schema":{ + "name":"io.mb3r.bering.snapshot", + "version":"1.0.0", + "uri":"https://mb3r-lab.github.io/Bering/schema/snapshot/v1.0.0/snapshot.schema.json", + "digest":"sha256:87e4e887ed4a37b72f6136e268b73552eccb92941c4de2c6f3a514dd066ea972" + } + } +}`) + + if err := ValidateSnapshotJSON(raw); err != nil { + t.Fatalf("ValidateSnapshotJSON returned error: %v", err) + } + if err := ValidateArtifactJSON(raw); err != nil { + t.Fatalf("ValidateArtifactJSON returned error: %v", err) + } +} diff --git a/internal/snapshot/io.go b/internal/snapshot/io.go new file mode 100644 index 0000000..35a0cc4 --- /dev/null +++ 
b/internal/snapshot/io.go @@ -0,0 +1,50 @@ +package snapshot + +import ( + "bytes" + "encoding/json" + "fmt" + "os" + "path/filepath" + + "github.com/MB3R-Lab/Bering/internal/jsoncanon" +) + +func ParseJSON(raw []byte) (Envelope, error) { + dec := json.NewDecoder(bytes.NewReader(raw)) + dec.DisallowUnknownFields() + var env Envelope + if err := dec.Decode(&env); err != nil { + return Envelope{}, fmt.Errorf("decode snapshot: %w", err) + } + if err := env.ValidateSemantic(); err != nil { + return Envelope{}, fmt.Errorf("semantic validation failed: %w", err) + } + return env, nil +} + +func MarshalCanonical(env Envelope) ([]byte, error) { + if err := env.ValidateSemantic(); err != nil { + return nil, fmt.Errorf("semantic validation failed: %w", err) + } + env.SortDeterministic() + raw, err := jsoncanon.MarshalIndent(env) + if err != nil { + return nil, fmt.Errorf("canonical json marshal failed: %w", err) + } + return raw, nil +} + +func WriteToFile(path string, env Envelope) error { + raw, err := MarshalCanonical(env) + if err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return fmt.Errorf("create output directory: %w", err) + } + if err := os.WriteFile(path, raw, 0o644); err != nil { + return fmt.Errorf("write snapshot file: %w", err) + } + return nil +} diff --git a/internal/snapshot/snapshot.go b/internal/snapshot/snapshot.go new file mode 100644 index 0000000..bf038e4 --- /dev/null +++ b/internal/snapshot/snapshot.go @@ -0,0 +1,323 @@ +package snapshot + +import ( + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "reflect" + "sort" + "strings" + "time" + + "github.com/MB3R-Lab/Bering/internal/model" +) + +type Envelope struct { + SnapshotID string `json:"snapshot_id"` + TopologyVersion string `json:"topology_version"` + WindowStart string `json:"window_start"` + WindowEnd string `json:"window_end"` + Ingest IngestSummary `json:"ingest"` + Counts Counts `json:"counts"` + Coverage CoverageSummary `json:"coverage"` 
// DiffSummary counts how the discovered topology differs between two
// snapshot envelopes. It is serialized under the envelope's "diff" key.
//
// ComputeDiff fills it by comparing discovery records by ID: "added" means
// the ID exists only in the current envelope, "removed" means it exists only
// in the previous one, and "changed" means it exists in both but the two
// records are not deeply equal. When there is no previous envelope, only the
// Added* fields are set and they equal the current record counts.
type DiffSummary struct {
	// Service-level counters.
	AddedServices   int `json:"added_services"`
	RemovedServices int `json:"removed_services"`
	ChangedServices int `json:"changed_services"`
	// Edge-level counters.
	AddedEdges   int `json:"added_edges"`
	RemovedEdges int `json:"removed_edges"`
	ChangedEdges int `json:"changed_edges"`
	// Endpoint-level counters.
	AddedEndpoints   int `json:"added_endpoints"`
	RemovedEndpoints int `json:"removed_endpoints"`
	ChangedEndpoints int `json:"changed_endpoints"`
}
// ServiceRecord is the discovery-side description of one service inside a
// snapshot envelope (discovery.services[]). The snapshot JSON Schema requires
// id, name, replicas and support; the remaining fields are optional.
type ServiceRecord struct {
	// ID is the key used when diffing and sorting service records.
	ID   string `json:"id"`
	Name string `json:"name"`
	// Replicas is constrained to be >= 0 by the snapshot JSON Schema.
	Replicas int `json:"replicas"`
	// Support carries the observation and trace counts backing this record.
	Support SupportSummary `json:"support"`
	// FirstSeen and LastSeen are declared as "date-time" strings in the
	// snapshot JSON Schema; they are omitted from the output when empty.
	FirstSeen  string       `json:"first_seen,omitempty"`
	LastSeen   string       `json:"last_seen,omitempty"`
	Provenance []Provenance `json:"provenance,omitempty"`
	// NOTE(review): with encoding/json, omitempty never omits a struct value,
	// so "metadata" would always be emitted (as {} when nothing is set).
	// Confirm whether jsoncanon behaves the same and whether that is intended.
	Metadata ServiceMetadata `json:"metadata,omitempty"`
}
`json:"weight,omitempty"` + PredicateRef string `json:"predicate_ref,omitempty"` + Attributes map[string]string `json:"attributes,omitempty"` +} + +type Metadata struct { + SourceType string `json:"source_type"` + SourceRef string `json:"source_ref"` + EmittedAt string `json:"emitted_at"` + Confidence float64 `json:"confidence"` + Schema model.SchemaRef `json:"schema"` +} + +func (e *Envelope) SortDeterministic() { + sort.Slice(e.Sources, func(i, j int) bool { + left, right := e.Sources[i], e.Sources[j] + if left.Type != right.Type { + return left.Type < right.Type + } + if left.Connector != right.Connector { + return left.Connector < right.Connector + } + return left.Ref < right.Ref + }) + sort.Slice(e.Discovery.Services, func(i, j int) bool { return e.Discovery.Services[i].ID < e.Discovery.Services[j].ID }) + sort.Slice(e.Discovery.Edges, func(i, j int) bool { return e.Discovery.Edges[i].ID < e.Discovery.Edges[j].ID }) + sort.Slice(e.Discovery.Endpoints, func(i, j int) bool { return e.Discovery.Endpoints[i].ID < e.Discovery.Endpoints[j].ID }) + sort.Slice(e.Discovery.Overlays, func(i, j int) bool { + if e.Discovery.Overlays[i].Precedence != e.Discovery.Overlays[j].Precedence { + return e.Discovery.Overlays[i].Precedence < e.Discovery.Overlays[j].Precedence + } + return e.Discovery.Overlays[i].Name < e.Discovery.Overlays[j].Name + }) + e.Model.SortDeterministic() +} + +func (e Envelope) ValidateSemantic() error { + if strings.TrimSpace(e.SnapshotID) == "" { + return errors.New("snapshot_id cannot be empty") + } + if strings.TrimSpace(e.TopologyVersion) == "" { + return errors.New("topology_version cannot be empty") + } + if _, err := time.Parse(time.RFC3339, e.WindowStart); err != nil { + return fmt.Errorf("window_start must be RFC3339: %w", err) + } + if _, err := time.Parse(time.RFC3339, e.WindowEnd); err != nil { + return fmt.Errorf("window_end must be RFC3339: %w", err) + } + if err := e.Model.ValidateSemantic(); err != nil { + return fmt.Errorf("model 
validation failed: %w", err) + } + if strings.TrimSpace(e.Metadata.SourceType) == "" { + return errors.New("metadata.source_type cannot be empty") + } + if strings.TrimSpace(e.Metadata.SourceRef) == "" { + return errors.New("metadata.source_ref cannot be empty") + } + if _, err := time.Parse(time.RFC3339, e.Metadata.EmittedAt); err != nil { + return fmt.Errorf("metadata.emitted_at must be RFC3339: %w", err) + } + if e.Metadata.Confidence < 0 || e.Metadata.Confidence > 1 { + return errors.New("metadata.confidence must be in [0,1]") + } + if e.Ingest.Spans < 0 || e.Ingest.Traces < 0 || e.Ingest.DroppedSpans < 0 || e.Ingest.LateSpans < 0 { + return errors.New("ingest counts must be >= 0") + } + if e.Counts.Services != len(e.Model.Services) { + return fmt.Errorf("counts.services mismatch: got=%d want=%d", e.Counts.Services, len(e.Model.Services)) + } + if e.Counts.Edges != len(e.Model.Edges) { + return fmt.Errorf("counts.edges mismatch: got=%d want=%d", e.Counts.Edges, len(e.Model.Edges)) + } + if e.Counts.Endpoints != len(e.Model.Endpoints) { + return fmt.Errorf("counts.endpoints mismatch: got=%d want=%d", e.Counts.Endpoints, len(e.Model.Endpoints)) + } + if len(e.Discovery.Services) != len(e.Model.Services) { + return fmt.Errorf("discovery.services mismatch: got=%d want=%d", len(e.Discovery.Services), len(e.Model.Services)) + } + if len(e.Discovery.Edges) != len(e.Model.Edges) { + return fmt.Errorf("discovery.edges mismatch: got=%d want=%d", len(e.Discovery.Edges), len(e.Model.Edges)) + } + if len(e.Discovery.Endpoints) != len(e.Model.Endpoints) { + return fmt.Errorf("discovery.endpoints mismatch: got=%d want=%d", len(e.Discovery.Endpoints), len(e.Model.Endpoints)) + } + return nil +} + +func TopologyDigest(mdl model.ResilienceModel) (string, error) { + raw, err := model.MarshalCanonical(mdl) + if err != nil { + return "", err + } + sum := sha256.Sum256(raw) + return "sha256:" + hex.EncodeToString(sum[:]), nil +} + +func BuildSnapshotID(windowStart, windowEnd, 
topologyVersion string) string { + sum := sha256.Sum256([]byte(strings.TrimSpace(windowStart) + "|" + strings.TrimSpace(windowEnd) + "|" + strings.TrimSpace(topologyVersion))) + return "snap-" + hex.EncodeToString(sum[:12]) +} + +func ComputeDiff(previous *Envelope, current Envelope) DiffSummary { + if previous == nil { + return DiffSummary{ + AddedServices: len(current.Discovery.Services), + AddedEdges: len(current.Discovery.Edges), + AddedEndpoints: len(current.Discovery.Endpoints), + } + } + return DiffSummary{ + AddedServices: diffAdded(serviceMap(previous.Discovery.Services), serviceMap(current.Discovery.Services)), + RemovedServices: diffAdded(serviceMap(current.Discovery.Services), serviceMap(previous.Discovery.Services)), + ChangedServices: diffChanged(serviceMap(previous.Discovery.Services), serviceMap(current.Discovery.Services)), + AddedEdges: diffAdded(edgeMap(previous.Discovery.Edges), edgeMap(current.Discovery.Edges)), + RemovedEdges: diffAdded(edgeMap(current.Discovery.Edges), edgeMap(previous.Discovery.Edges)), + ChangedEdges: diffChanged(edgeMap(previous.Discovery.Edges), edgeMap(current.Discovery.Edges)), + AddedEndpoints: diffAdded(endpointMap(previous.Discovery.Endpoints), endpointMap(current.Discovery.Endpoints)), + RemovedEndpoints: diffAdded(endpointMap(current.Discovery.Endpoints), endpointMap(previous.Discovery.Endpoints)), + ChangedEndpoints: diffChanged(endpointMap(previous.Discovery.Endpoints), endpointMap(current.Discovery.Endpoints)), + } +} + +func serviceMap(items []ServiceRecord) map[string]ServiceRecord { + out := make(map[string]ServiceRecord, len(items)) + for _, item := range items { + out[item.ID] = item + } + return out +} + +func edgeMap(items []EdgeRecord) map[string]EdgeRecord { + out := make(map[string]EdgeRecord, len(items)) + for _, item := range items { + out[item.ID] = item + } + return out +} + +func endpointMap(items []EndpointRecord) map[string]EndpointRecord { + out := make(map[string]EndpointRecord, len(items)) + 
// diffChanged counts the keys that exist in both left and right but map to
// values that are not deeply equal (per reflect.DeepEqual). Keys present on
// only one side are ignored.
func diffChanged[T any](left, right map[string]T) int {
	changed := 0
	for id, before := range left {
		if after, present := right[id]; present && !reflect.DeepEqual(before, after) {
			changed++
		}
	}
	return changed
}
env.SnapshotID { + t.Fatalf("snapshot id mismatch: got=%s want=%s", parsed.SnapshotID, env.SnapshotID) + } +} + +func sampleEnvelope(windowEnd string, services []ServiceRecord) Envelope { + end, _ := time.Parse(time.RFC3339, windowEnd) + mdl := model.ResilienceModel{ + Services: []model.Service{{ID: "frontend", Name: "frontend", Replicas: services[0].Replicas}}, + Edges: []model.Edge{}, + Endpoints: []model.Endpoint{{ID: "frontend:GET /health", EntryService: "frontend", SuccessPredicateRef: "frontend:GET /health"}}, + Metadata: model.Metadata{SourceType: "bering", SourceRef: "bering://test", DiscoveredAt: windowEnd, Confidence: 0.7, Schema: model.SchemaRef{Name: "io.mb3r.bering.model", Version: "1.0.0", URI: "https://mb3r-lab.github.io/Bering/schema/model/v1.0.0/model.schema.json", Digest: "sha256:272277c093f37580adcd2dded225bd37c86539d642d7910baad7e4228227d1a7"}}, + } + mdl.SortDeterministic() + topology, _ := TopologyDigest(mdl) + return Envelope{ + SnapshotID: BuildSnapshotID(windowEnd, windowEnd, topology), + TopologyVersion: topology, + WindowStart: windowEnd, + WindowEnd: windowEnd, + Ingest: IngestSummary{Spans: 2, Traces: 1}, + Counts: Counts{Services: len(mdl.Services), Edges: len(mdl.Edges), Endpoints: len(mdl.Endpoints)}, + Coverage: CoverageSummary{Confidence: 0.7, ServiceSupportMin: 1}, + Sources: []SourceSummary{{Type: "traces", Connector: "otlp_http", Ref: "bering://serve", Observations: 2}}, + Diff: DiffSummary{AddedServices: len(services)}, + Discovery: DiscoveryDetails{ + Services: services, + Edges: []EdgeRecord{}, + Endpoints: []EndpointRecord{{ID: "frontend:GET /health", EntryService: "frontend", Method: "GET", Path: "/health", Support: SupportSummary{Observations: 1, TraceCount: 1}}}, + }, + Model: mdl, + Metadata: Metadata{ + SourceType: "bering", + SourceRef: "bering://serve", + EmittedAt: end.Format(time.RFC3339), + Confidence: 0.7, + Schema: model.SchemaRef{Name: "io.mb3r.bering.snapshot", Version: "1.0.0", URI: 
"https://mb3r-lab.github.io/Bering/schema/snapshot/v1.0.0/snapshot.schema.json", Digest: "sha256:87e4e887ed4a37b72f6136e268b73552eccb92941c4de2c6f3a514dd066ea972"}, + }, + } +}