From e38b5dd71cc6dc169a2b4624fadd057892ffba1a Mon Sep 17 00:00:00 2001 From: Sahas Subramanian Date: Thu, 7 May 2026 15:00:35 +0000 Subject: [PATCH 1/4] Make Bounds combination methods public Expose combine_parallel, intersect, and merge_if_overlapping on Bounds so external callers can reuse them. Signed-off-by: Sahas Subramanian --- src/bounds.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bounds.rs b/src/bounds.rs index 183b4f6..71ce3c5 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -34,7 +34,7 @@ impl Bounds { } /// Combines two bounds as if their components were connected in parallel. - pub(crate) fn combine_parallel(&self, other: &Self) -> Vec { + pub fn combine_parallel(&self, other: &Self) -> Vec { if self.intersect(other).is_none() { return vec![self.clone(), other.clone()]; } @@ -67,7 +67,7 @@ impl Bounds { /// Returns the intersection of `self` and `other`, or `None` if the /// intersection is empty. - pub(crate) fn intersect(&self, other: &Self) -> Option { + pub fn intersect(&self, other: &Self) -> Option { let lower = Self::map_or_any(Q::max, self.lower, other.lower); let upper = Self::map_or_any(Q::min, self.upper, other.upper); if let (Some(lower), Some(upper)) = (lower, upper) @@ -80,7 +80,7 @@ impl Bounds { /// If `self` and `other` overlap, returns the smallest single interval /// that contains both; otherwise returns `None`. 
- pub(crate) fn merge_if_overlapping(&self, other: &Self) -> Option { + pub fn merge_if_overlapping(&self, other: &Self) -> Option { self.intersect(other)?; Some(Bounds { lower: self.lower.and_then(|a| other.lower.map(|b| a.min(b))), From 3fe51dd36ce0f01152886f98edd906e458b3a4d7 Mon Sep 17 00:00:00 2001 From: Sahas Subramanian Date: Thu, 7 May 2026 15:14:18 +0000 Subject: [PATCH 2/4] Connect to microgrid API lazily so startup tolerates absent server Replace the eager Channel::connect call in MicrogridClientHandle::try_new with Endpoint::connect_lazy, so a missing server at startup no longer causes the calling app to exit immediately. Connection errors now surface per-RPC, where the actor's existing per-stream retry loop can recover once the server becomes reachable. Signed-off-by: Sahas Subramanian --- src/client/microgrid_client_handle.rs | 33 +++++++++++++++------------ src/microgrid.rs | 7 ++++-- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/client/microgrid_client_handle.rs b/src/client/microgrid_client_handle.rs index c8325f5..c785c50 100644 --- a/src/client/microgrid_client_handle.rs +++ b/src/client/microgrid_client_handle.rs @@ -8,7 +8,7 @@ use chrono::TimeDelta; use tokio::sync::{broadcast, mpsc, oneshot}; -use tonic::transport::Channel; +use tonic::transport::{Channel, Endpoint}; use crate::{ Bounds, Error, @@ -36,20 +36,25 @@ pub struct MicrogridClientHandle { } impl MicrogridClientHandle { - /// Creates a new `MicrogridClientHandle` that connects to the microgrid API - /// at the specified URL. + /// Creates a new `MicrogridClientHandle` for the microgrid API at the + /// specified URL. + /// + /// The connection is established lazily on the first RPC, so this method + /// succeeds even when no server is reachable yet. Per-call errors will + /// surface from the individual RPC methods, and the actor's per-stream + /// retry loop will keep attempting to reconnect telemetry streams. 
+ /// + /// Returns an error only if `url` is not a valid endpoint URL. pub async fn try_new(url: impl Into) -> Result { - let client = match MicrogridClient::::connect(url.into()).await { - Ok(t) => t, - Err(e) => { - tracing::error!("Could not connect to server: {e}"); - return Err(Error::connection_failure(format!( - "Could not connect to server: {e}" - ))); - } - }; - - Ok(Self::new_from_client(client)) + let url = url.into(); + let channel = Endpoint::from_shared(url.clone()) + .map_err(|e| { + Error::connection_failure(format!("Invalid microgrid API URL {url}: {e}")) + })? + .connect_lazy(); + Ok(Self::new_from_client(MicrogridClient::::new( + channel, + ))) } pub fn new_from_client(client: impl MicrogridApiClient) -> Self { diff --git a/src/microgrid.rs b/src/microgrid.rs index 3f102dc..c8075dd 100644 --- a/src/microgrid.rs +++ b/src/microgrid.rs @@ -21,8 +21,11 @@ impl Microgrid { /// Creates a new `Microgrid` instance with the given microgrid API URL and /// logical meter configuration. /// - /// Returns an error if the URL is unreachable, or if the component graph - /// cannot be created with the given configuration. + /// The microgrid API connection is established lazily and connection or + /// component-graph build errors during setup are retried indefinitely, so + /// this call blocks until the server is reachable and returns valid data. + /// Returns an error only if the URL is malformed or if the provided + /// logical meter configuration is invalid. pub async fn try_new( url: impl Into, config: LogicalMeterConfig, From 734d0b208178b228ff4b46274de7508b5accf77c Mon Sep 17 00:00:00 2001 From: Sahas Subramanian Date: Thu, 7 May 2026 15:57:18 +0000 Subject: [PATCH 3/4] Retry component-graph setup in LogicalMeterHandle::try_new Wrap the entire fetch-and-build sequence (list components, list connections, build the component graph) in a single retry loop that sleeps 3 seconds between attempts and keeps trying until it succeeds. 
This way, transient API failures and graph-build errors stemming from incomplete server-side data both clear themselves up without aborting the calling app at startup. Signed-off-by: Sahas Subramanian --- src/logical_meter/logical_meter_handle.rs | 61 +++++++++++++++++------ 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/src/logical_meter/logical_meter_handle.rs b/src/logical_meter/logical_meter_handle.rs index 25f79f0..469d24b 100644 --- a/src/logical_meter/logical_meter_handle.rs +++ b/src/logical_meter/logical_meter_handle.rs @@ -13,6 +13,7 @@ use crate::{ }; use frequenz_microgrid_component_graph::{self, ComponentGraph}; use std::collections::BTreeSet; +use std::time::Duration; use tokio::sync::mpsc; use super::{LogicalMeterConfig, logical_meter_actor::LogicalMeterActor}; @@ -26,6 +27,11 @@ pub struct LogicalMeterHandle { impl LogicalMeterHandle { /// Creates a new LogicalMeter instance. + /// + /// Listing the components and connections from the API and building the + /// component graph is retried indefinitely with a fixed 3-second delay, so + /// this call blocks until the server is reachable and returns data that + /// forms a valid graph. Returns an error only if `config` is invalid. 
pub async fn try_new( client: MicrogridClientHandle, config: LogicalMeterConfig, @@ -39,21 +45,19 @@ impl LogicalMeterHandle { clock: C, ) -> Result { let (sender, receiver) = mpsc::channel(8); - let graph = ComponentGraph::try_new( - client.list_electrical_components(vec![], vec![]).await?, - client - .list_electrical_component_connections(vec![], vec![]) - .await?, - frequenz_microgrid_component_graph::ComponentGraphConfig { - allow_component_validation_failures: true, - allow_unconnected_components: true, - allow_unspecified_inverters: false, - disable_fallback_components: false, - }, - ) - .map_err(|e| { - Error::component_graph_error(format!("Unable to create a component graph: {e}")) - })?; + const RETRY_DELAY: Duration = Duration::from_secs(3); + let graph = loop { + match build_component_graph(&client).await { + Ok(g) => break g, + Err(reason) => { + tracing::warn!( + "Microgrid logical-meter setup failed, retrying in {:?}: {reason}", + RETRY_DELAY + ); + tokio::time::sleep(RETRY_DELAY).await; + } + } + }; let logical_meter = LogicalMeterActor::try_new(receiver, client, config, clock)?; @@ -174,6 +178,33 @@ impl LogicalMeterHandle { } } +/// Lists the components and connections from the API and builds the +/// component graph. Errors from each step are stringified with a prefix so +/// the retry loop can log a concise reason. 
+async fn build_component_graph( + client: &MicrogridClientHandle, +) -> Result, String> { + let components = client + .list_electrical_components(vec![], vec![]) + .await + .map_err(|e| format!("fetching components failed: {e}"))?; + let connections = client + .list_electrical_component_connections(vec![], vec![]) + .await + .map_err(|e| format!("fetching component connections failed: {e}"))?; + ComponentGraph::try_new( + components, + connections, + frequenz_microgrid_component_graph::ComponentGraphConfig { + allow_component_validation_failures: true, + allow_unconnected_components: true, + allow_unspecified_inverters: false, + disable_fallback_components: false, + }, + ) + .map_err(|e| format!("building component graph failed: {e}")) +} + #[cfg(test)] mod tests { use chrono::TimeDelta; From a06d7c9fea6afc053f71ec0b77ff73ba41079db1 Mon Sep 17 00:00:00 2001 From: Sahas Subramanian Date: Thu, 7 May 2026 15:58:17 +0000 Subject: [PATCH 4/4] Update release notes Signed-off-by: Sahas Subramanian --- RELEASE_NOTES.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index c6a1766..f6da5c2 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -6,11 +6,13 @@ ## Upgrading - +- `MicrogridClientHandle::try_new`, `LogicalMeterHandle::try_new`, and `Microgrid::try_new` no longer return an error when the microgrid API server is unreachable at startup or when the server returns data that doesn't yet form a valid component graph; instead they wait for the server to recover. Callers that relied on a quick failure to detect a misconfigured or unavailable endpoint should wrap the call in `tokio::time::timeout` (or equivalent) to bound the wait. URL validation still fails fast: a malformed endpoint URL is still surfaced as `ConnectionFailure` from `MicrogridClientHandle::try_new`. An invalid `LogicalMeterConfig` is still reported as an error by `LogicalMeterHandle::try_new`, but only after the component graph has been built, so it is subject to the same wait. 
## New Features - +- The microgrid client now tolerates the API server being absent or returning incomplete data at startup. `MicrogridClientHandle::try_new` establishes the gRPC connection lazily, so it succeeds regardless of whether the server is reachable; transient stream errors are then handled by the existing per-stream retry loop. `LogicalMeterHandle::try_new` (and therefore `Microgrid::try_new`) wraps the entire component-graph setup — listing components, listing connections, and building the graph — in a single retry loop that sleeps 3 seconds between attempts, so applications block waiting for the server and a valid graph instead of exiting with an error. + +- `Bounds::combine_parallel`, `Bounds::intersect`, and `Bounds::merge_if_overlapping` are now public, allowing external callers to combine bounds without going through higher-level types. ## Bug Fixes