Skip to content

Commit ca3b41b

Browse files
authored
Fix "sent trigger to force oximeter collection: QueueFull" flakiness (#9973)
Although no issue is currently open for this failure, I hit it in two of the three builds I ran on faster CI instances.
1 parent 38c1302 commit ca3b41b

1 file changed

Lines changed: 45 additions & 8 deletions

File tree

nexus/tests/integration_tests/metrics_querier.rs

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use omicron_test_utils::dev::poll::wait_for_condition;
2020
use oximeter::Datum;
2121
use oximeter::Measurement;
2222
use oximeter::TimeseriesSchema;
23+
use oximeter_collector::ForcedCollectionError;
2324
use serde::de::DeserializeOwned;
2425
use slog::Logger;
2526
use std::borrow::Cow;
@@ -220,10 +221,28 @@ impl<'a, N> MetricsQuerier<'a, N> {
220221
{
221222
let result = wait_for_condition(
222223
|| async {
223-
self.ctx
224-
.oximeter
225-
.try_force_collect()
226-
.expect("sent trigger to force oximeter collection");
224+
// On faster CI runners we encountered a lot of flakiness caused
225+
// by try_force_collect() returning the QueueFull error.
226+
//
227+
// At the time of writing this, the method just puts a message
228+
// in a bounded channel (with a capacity of 4), with QueueFull
229+
// indicating that the channel reached capacity.
230+
//
231+
// wait_for_condition() will call this every second until the
232+
// condition matches, so if collection is not done within four
233+
// seconds this will error with QueueFull.
234+
//
235+
// In those cases, rather than failing the test we should just
236+
// respect the backpressure and try again the next iteration.
237+
match self.ctx.oximeter.try_force_collect() {
238+
Ok(()) => {}
239+
Err(ForcedCollectionError::QueueFull) => {
240+
return Err(CondCheckError::<()>::NotYet);
241+
}
242+
Err(e) => {
243+
panic!("failed to start oximeter collection: {e:?}");
244+
}
245+
}
227246

228247
let page = objects_list_page_authz::<U>(
229248
&self.ctx.external_client,
@@ -274,10 +293,28 @@ impl<'a, N> MetricsQuerier<'a, N> {
274293
{
275294
let result = wait_for_condition(
276295
|| async {
277-
self.ctx
278-
.oximeter
279-
.try_force_collect()
280-
.expect("sent trigger to force oximeter collection");
296+
// On faster CI runners we encountered a lot of flakiness caused
297+
// by try_force_collect() returning the QueueFull error.
298+
//
299+
// At the time of writing this, the method just puts a message
300+
// in a bounded channel (with a capacity of 4), with QueueFull
301+
// indicating that the channel reached capacity.
302+
//
303+
// wait_for_condition() will call this every second until the
304+
// condition matches, so if collection is not done within four
305+
// seconds this will error with QueueFull.
306+
//
307+
// In those cases, rather than failing the test we should just
308+
// respect the backpressure and try again the next iteration.
309+
match self.ctx.oximeter.try_force_collect() {
310+
Ok(()) => {}
311+
Err(ForcedCollectionError::QueueFull) => {
312+
return Err(CondCheckError::<()>::NotYet);
313+
}
314+
Err(e) => {
315+
panic!("failed to start oximeter collection: {e:?}");
316+
}
317+
}
281318

282319
let tables = match self
283320
.execute_query_once(endpoint, query.to_string())

0 commit comments

Comments
 (0)