Skip to content

Commit 7f188ef

Browse files
authored
feat(observability): propagate W3C trace context across nico-api's network boundaries (#2700)
Implements W3C Trace Context propagation for nico-api so a request traced upstream keeps one trace as it passes through nico-api and into the services it calls. - **Propagator:** install the standard W3C `TraceContextPropagator` at startup (the OTel default is no-op). - **Ingress (REST + gRPC):** one shared per-request layer extracts the inbound `traceparent`/`tracestate` and parents the request span to it; no valid inbound context → fresh root. - **Egress:** gRPC clients are wrapped in a tower `TraceInjectService`; reqwest clients go through `reqwest-tracing`'s middleware; the two OAuth2 token providers inject manually (the `oauth2` crate builds their requests). - **Enable flag unchanged:** the local `tracing-enabled` switch stays the master control — an inbound `sampled` flag can't force local recording. Per-client details and how to add a new client are in `docs/observability/tracing.md`. ## Related issues #2438 ## Type of Change - [x] **Add** - New feature or capability - [ ] **Change** - Changes in existing functionality - [ ] **Fix** - Bug fixes - [ ] **Remove** - Removed features or deprecated functionality - [ ] **Internal** - Internal changes (refactoring, tests, docs, etc.) ## Breaking Changes - [ ] **This PR contains breaking changes** ## Testing - [x] Unit tests added/updated - [ ] Integration tests added/updated - [ ] Manual testing performed - [ ] No testing required (docs, internal refactor, etc.) ## Additional Notes - **Scope is the nico-api process.** `bmc-proxy` and `hardware-health` are intentionally **not** instrumented: they build no span exporter, so injecting there would be dead code. `docs/observability/tracing.md` §1.7 documents how to add propagation to a new client if that changes. - New crate `trace-propagation` holds the shared glue + tower middleware. - **`tracing.md` marker references are a doc-staleness fix, not part of this feature.** The doc still described the sampler's old `code.namespace` marker; #2268 replaced it with `carbide.trace_root` without updating the doc. The `code.namespace`→`carbide.trace_root` doc edits just correct that. (The doc's `ParentBased` mention is also updated, since this PR unwraps the sampler.) - Dependencies added: `opentelemetry-http` 0.31, `reqwest-middleware` 0.5, `reqwest-tracing` 0.7. --------- Signed-off-by: Adnan Dosani <258082943+adnandnv@users.noreply.github.com> Co-authored-by: Adnan Dosani <258082943+adnandnv@users.noreply.github.com>
1 parent 9383cc7 commit 7f188ef

33 files changed

Lines changed: 1007 additions & 76 deletions

File tree

Cargo.lock

Lines changed: 63 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ sqlx-macros-core = { version = "0.9" }
5252

5353
opentelemetry = { version = "0.31.0", features = ["logs"] }
5454
opentelemetry_sdk = { version = "0.31.0", features = ["logs", "rt-tokio"] }
55+
opentelemetry-http = "0.31.0"
5556
opentelemetry-otlp = { version = "0.31.1", features = ["metrics"] }
5657
opentelemetry-prometheus = "0.31.0"
5758
opentelemetry-semantic-conventions = "0.31.0"
@@ -194,6 +195,10 @@ ratatui = "0.30.0"
194195
rcgen = "0.14.7"
195196
regex = "1.11.2"
196197
reqwest = { default-features = false, version = "0.13.3" }
198+
reqwest-middleware = "0.5"
199+
reqwest-tracing = { version = "0.7", default-features = false, features = [
200+
"opentelemetry_0_31",
201+
] }
197202
resolv-conf = "0.7.0"
198203
ringbuf = "0.5.0"
199204
rsa = "0.9.9"

crates/api-core/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ nras = { path = "../nras" }
7777
spancounter = { path = "../spancounter" }
7878
state-controller = { path = "../state-controller" }
7979
sqlx-query-tracing = { path = "../sqlx-query-tracing" }
80+
trace-propagation = { path = "../trace-propagation" }
8081

8182
# External dependencies. PLEASE KEEP ALPHABETIZED ORDER.
8283
arc-swap = { workspace = true }
@@ -133,6 +134,8 @@ reqwest = { workspace = true, default-features = false, features = [
133134
"rustls",
134135
"stream",
135136
] }
137+
reqwest-middleware = { workspace = true }
138+
reqwest-tracing = { workspace = true }
136139
rsa = { workspace = true }
137140
rumqttc = { workspace = true }
138141
rustls = { workspace = true }

crates/api-core/src/handlers/redfish.rs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ pub(crate) async fn create_client(
393393
rpc::forge::BmcMetaDataGetResponse,
394394
http::Uri,
395395
HeaderMap,
396-
reqwest::Client,
396+
reqwest_middleware::ClientWithMiddleware,
397397
),
398398
CarbideError,
399399
> {
@@ -450,15 +450,20 @@ pub(crate) async fn create_client(
450450
.connect_timeout(std::time::Duration::from_secs(5)) // Limit connections to 5 seconds
451451
.timeout(std::time::Duration::from_secs(60)); // Limit the overall request to 60 seconds
452452

453-
match builder.build() {
453+
let client = match builder.build() {
454454
Ok(client) => client,
455455
Err(err) => {
456456
tracing::error!(%err, "build_http_client");
457457
return Err(CarbideError::internal(format!(
458458
"Http building failed: {err}"
459459
)));
460460
}
461-
}
461+
};
462+
// The `reqwest-tracing` middleware injects the current span's W3C trace context into every
463+
// outgoing request (#2438).
464+
reqwest_middleware::ClientBuilder::new(client)
465+
.with(reqwest_tracing::TracingMiddleware::default())
466+
.build()
462467
};
463468
Ok((metadata, new_uri, headers, http_client))
464469
}
@@ -558,15 +563,15 @@ impl TestBehavior {
558563
}
559564
}
560565

561-
// Subset of the data we care about from reqwest::Error, so that we can mock it (we can't build our
562-
// own reqwest::Error as its constructors are all private.)
566+
// Subset of the data we care about from the HTTP error, so that we can mock it (we can't build our
567+
// own reqwest error as its constructors are all private.)
563568
pub struct RequestErrorInfo {
564569
pub status_code: Option<http::status::StatusCode>,
565570
pub description: String,
566571
}
567572

568-
impl From<reqwest::Error> for RequestErrorInfo {
569-
fn from(e: reqwest::Error) -> Self {
573+
impl From<reqwest_middleware::Error> for RequestErrorInfo {
574+
fn from(e: reqwest_middleware::Error) -> Self {
570575
Self {
571576
status_code: e.status(),
572577
description: e.to_string(),

crates/api-core/src/logging/api_logs.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,13 @@ where
163163
tenant.organization_id = tracing::field::Empty,
164164
);
165165

166+
// Continue any distributed trace the caller started: make the inbound W3C trace
167+
// context (`traceparent`/`tracestate`) the parent of this request span. The shared
168+
// tower layer serves both the gRPC and REST listeners, so this single extractor
169+
// covers both ingress paths. No-op when there is no valid inbound context, leaving
170+
// the request span a fresh root (issue #2438).
171+
trace_propagation::set_span_parent_from_headers(&request_span, request.headers());
172+
166173
// Try to extract the gRPC service and method from the URI
167174
let mut grpc_method: Option<String> = None;
168175
let mut grpc_service: Option<String> = None;

0 commit comments

Comments
 (0)