Skip to content

Commit cc0e22d

Browse files
committed
fix(fault_manager): address review on rosbag crash-safety
Inject the storage probe as a seam and add direct tests for the fallback and self-disable branches (mcap is in CI so the real probe never reached them). Surface the probe failure reason + install hint at WARN, with a distinct message when the always-shipped sqlite3 baseline itself fails. Make mcap a test-only dependency and document the automatic fallback; default bulk-data mimetype to sqlite3 for bags without a format field.
1 parent e526ce4 commit cc0e22d

6 files changed

Lines changed: 151 additions & 30 deletions

File tree

docs/tutorials/snapshots.rst

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,16 @@ Saves resources but may miss context if fault confirms before buffer fills.
454454
.. note::
455455

456456
The ``"mcap"`` format requires ``rosbag2_storage_mcap`` to be installed.
457-
If not available, use ``"sqlite3"`` (default).
457+
``"sqlite3"`` (the default) is always shipped with rosbag2 and needs no extra
458+
package.
459+
460+
You do not have to switch formats manually: if a configured backend's plugin
461+
is unavailable at startup, the FaultManager logs a warning naming the missing
462+
package and automatically falls back to ``"sqlite3"`` for black-box capture.
463+
If no storage backend is usable at all, rosbag capture self-disables (the node
464+
keeps running and freeze-frame snapshots are unaffected) instead of crashing.
465+
466+
To use mcap (e.g. for Foxglove), install the plugin:
458467

459468
.. code-block:: bash
460469

src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/rosbag_capture.hpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616

1717
#include <atomic>
1818
#include <deque>
19+
#include <functional>
1920
#include <map>
2021
#include <memory>
2122
#include <mutex>
23+
#include <optional>
2224
#include <set>
2325
#include <string>
2426
#include <vector>
@@ -53,13 +55,19 @@ struct BufferedMessage {
5355
/// - on_fault_cleared() deletes bag file if auto_cleanup enabled
5456
class RosbagCapture {
5557
public:
58+
/// Probe a rosbag2 storage backend. Returns std::nullopt when the backend is
59+
/// usable, or the failure reason (never throws). Injectable so tests can force a
60+
/// backend unavailable without depending on which plugins CI happens to install.
61+
using StorageProbeFn = std::function<std::optional<std::string>(const std::string & format)>;
62+
5663
/// Create rosbag capture
5764
/// @param node ROS 2 node for creating subscriptions and timers
5865
/// @param storage Fault storage for persisting bag file metadata
5966
/// @param config Rosbag configuration
6067
/// @param snapshot_config Snapshot configuration (for topic resolution when topics="config")
68+
/// @param storage_probe Optional storage-backend probe override (default: real probe)
6169
RosbagCapture(rclcpp::Node * node, FaultStorage * storage, const RosbagConfig & config,
62-
const SnapshotConfig & snapshot_config);
70+
const SnapshotConfig & snapshot_config, StorageProbeFn storage_probe = {});
6371

6472
~RosbagCapture();
6573

@@ -143,10 +151,13 @@ class RosbagCapture {
143151
/// Timer callback for retrying topic discovery
144152
void discovery_retry_callback();
145153

146-
/// Probe whether a rosbag2 storage plugin is available by opening a throwaway
147-
/// bag. Returns false (never throws) when the plugin is not installed, so the
148-
/// caller can degrade gracefully instead of terminating the node.
149-
bool storage_format_available(const std::string & format) const;
154+
/// Default storage probe: opens a throwaway bag for @p format. Returns
155+
/// std::nullopt when usable, or the failure reason (never throws), so the caller
156+
/// can degrade gracefully instead of terminating the node.
157+
std::optional<std::string> default_storage_probe(const std::string & format) const;
158+
159+
/// Storage-backend probe (the default real probe, or a test override).
160+
StorageProbeFn storage_probe_;
150161

151162
rclcpp::Node * node_;
152163
FaultStorage * storage_;

src/ros2_medkit_fault_manager/package.xml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
<depend>nlohmann-json-dev</depend>
1919
<depend>rosbag2_cpp</depend>
2020
<depend>rosbag2_storage</depend>
21-
<depend>rosbag2_storage_mcap</depend>
2221

2322
<test_depend>ament_lint_auto</test_depend>
2423
<test_depend>ament_lint_common</test_depend>
24+
<!-- mcap is an opt-in storage backend (sqlite3 is the always-shipped default with
25+
a runtime fallback); only the rosbag integration test needs the plugin present -->
26+
<test_depend>rosbag2_storage_mcap</test_depend>
2527
<test_depend>ament_cmake_clang_format</test_depend>
2628
<test_depend>ament_cmake_clang_tidy</test_depend>
2729
<test_depend>ament_cmake_gtest</test_depend>

src/ros2_medkit_fault_manager/src/rosbag_capture.cpp

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,39 @@
1818
#include <unistd.h>
1919

2020
#include <algorithm>
21+
#include <cstdlib>
2122
#include <filesystem>
23+
#include <optional>
2224
#include <rosbag2_cpp/writer.hpp>
2325
#include <rosbag2_storage/storage_options.hpp>
2426
#include <set>
2527
#include <sstream>
28+
#include <utility>
2629

2730
#include "ros2_medkit_fault_manager/time_utils.hpp"
2831

2932
namespace ros2_medkit_fault_manager {
3033

3134
namespace {
3235

36+
/// Actionable install hint for an unavailable storage backend.
37+
std::string storage_plugin_hint(const std::string & format) {
38+
if (format == "mcap") {
39+
const char * distro = std::getenv("ROS_DISTRO");
40+
const std::string d = (distro && *distro) ? distro : "$ROS_DISTRO";
41+
return "install ros-" + d + "-rosbag2-storage-mcap";
42+
}
43+
return "check the rosbag2 storage plugin installation";
44+
}
45+
46+
/// Bound the probe reason so a verbose pluginlib error does not flood the log.
47+
std::string truncate_reason(const std::string & reason, size_t max_len = 200) {
48+
if (reason.size() <= max_len) {
49+
return reason;
50+
}
51+
return reason.substr(0, max_len) + "...";
52+
}
53+
3354
/// Custom deleter for rcutils_uint8_array_t that calls rcutils_uint8_array_fini
3455
struct Uint8ArrayDeleter {
3556
void operator()(rcutils_uint8_array_t * array) const {
@@ -85,7 +106,7 @@ std::shared_ptr<rosbag2_storage::SerializedBagMessage> create_bag_message(const
85106
} // namespace
86107

87108
RosbagCapture::RosbagCapture(rclcpp::Node * node, FaultStorage * storage, const RosbagConfig & config,
88-
const SnapshotConfig & snapshot_config)
109+
const SnapshotConfig & snapshot_config, StorageProbeFn storage_probe)
89110
: node_(node), storage_(storage), config_(config), snapshot_config_(snapshot_config) {
90111
if (!node_) {
91112
throw std::invalid_argument("RosbagCapture requires a valid node pointer");
@@ -94,6 +115,10 @@ RosbagCapture::RosbagCapture(rclcpp::Node * node, FaultStorage * storage, const
94115
throw std::invalid_argument("RosbagCapture requires a valid storage pointer");
95116
}
96117

118+
storage_probe_ = storage_probe ? std::move(storage_probe) : [this](const std::string & f) {
119+
return default_storage_probe(f);
120+
};
121+
97122
if (!config_.enabled) {
98123
RCLCPP_INFO(node_->get_logger(), "RosbagCapture disabled");
99124
return;
@@ -108,21 +133,34 @@ RosbagCapture::RosbagCapture(rclcpp::Node * node, FaultStorage * storage, const
108133
RCLCPP_WARN(node_->get_logger(), "Unknown rosbag storage format '%s'; using 'sqlite3'", config_.format.c_str());
109134
config_.format = "sqlite3";
110135
}
111-
if (!storage_format_available(config_.format)) {
112-
if (config_.format != "sqlite3" && storage_format_available("sqlite3")) {
136+
137+
if (auto probe_err = storage_probe_(config_.format)) {
138+
const std::string reason = truncate_reason(*probe_err);
139+
if (config_.format == "sqlite3") {
140+
// The always-shipped baseline failed: an environment-level problem (broken
141+
// rosbag2 base install / disk / permissions), not a missing optional plugin.
113142
RCLCPP_WARN(node_->get_logger(),
114-
"Rosbag storage format '%s' is unavailable (see debug log for the probe error); falling back "
115-
"to 'sqlite3' for black-box capture",
116-
config_.format.c_str());
117-
config_.format = "sqlite3";
118-
} else {
143+
"sqlite3 rosbag storage is unavailable (%s); the rosbag2 base install may be broken. "
144+
"Black-box rosbag capture disabled (freeze-frame snapshots still work)",
145+
reason.c_str());
146+
config_.enabled = false;
147+
return;
148+
}
149+
if (auto sqlite_err = storage_probe_("sqlite3")) {
119150
RCLCPP_WARN(node_->get_logger(),
120-
"No usable rosbag storage backend ('%s' and 'sqlite3' both unavailable); black-box rosbag "
121-
"capture disabled (freeze-frame snapshots still work)",
122-
config_.format.c_str());
151+
"No usable rosbag storage backend: '%s' (%s) and sqlite3 (%s) both failed to load; black-box "
152+
"rosbag capture disabled (freeze-frame snapshots still work)",
153+
config_.format.c_str(), reason.c_str(), truncate_reason(*sqlite_err).c_str());
123154
config_.enabled = false;
124155
return;
125156
}
157+
// Explicitly-configured (non-default) format unavailable: name the fix so an
158+
// operator who chose e.g. mcap for Foxglove is not silently downgraded.
159+
RCLCPP_WARN(node_->get_logger(),
160+
"Rosbag storage format '%s' is unavailable (%s); %s. Falling back to 'sqlite3' for black-box "
161+
"capture (sqlite3 bags are not directly Foxglove-readable)",
162+
config_.format.c_str(), reason.c_str(), storage_plugin_hint(config_.format).c_str());
163+
config_.format = "sqlite3";
126164
}
127165

128166
RCLCPP_INFO(node_->get_logger(), "RosbagCapture initialized (duration=%.1fs, after=%.1fs, lazy_start=%s, format=%s)",
@@ -721,30 +759,28 @@ void RosbagCapture::post_fault_timer_callback() {
721759
current_bag_path_.clear();
722760
}
723761

724-
bool RosbagCapture::storage_format_available(const std::string & format) const {
725-
// Probe the plugin by opening a throwaway bag. Returns false (never throws)
726-
// when the plugin is missing, so the caller can fall back instead of crashing.
762+
std::optional<std::string> RosbagCapture::default_storage_probe(const std::string & format) const {
763+
// Probe the plugin by opening a throwaway bag. Returns the failure reason (never
764+
// throws) when the plugin is missing or unusable, so the caller can fall back or
765+
// self-disable instead of crashing.
727766
const std::string test_path =
728767
std::filesystem::temp_directory_path().string() + "/.rosbag_format_test_" + std::to_string(getpid());
729768

730-
bool ok = false;
769+
std::optional<std::string> error;
731770
try {
732771
rosbag2_cpp::Writer writer;
733772
rosbag2_storage::StorageOptions opts;
734773
opts.uri = test_path;
735774
opts.storage_id = format;
736775
writer.open(opts);
737-
ok = true;
738776
} catch (const std::exception & e) {
739-
// Keep the reason (plugin missing vs. I/O / permission) for diagnosis; the
740-
// probe itself stays non-fatal.
741-
RCLCPP_DEBUG(node_->get_logger(), "Rosbag storage format '%s' probe failed: %s", format.c_str(), e.what());
742-
ok = false;
777+
// Keep the reason (plugin missing vs. I/O / permission) for the caller's log.
778+
error = e.what();
743779
}
744780

745781
std::error_code ec;
746782
std::filesystem::remove_all(test_path, ec);
747-
return ok;
783+
return error;
748784
}
749785

750786
} // namespace ros2_medkit_fault_manager

src/ros2_medkit_fault_manager/test/test_rosbag_capture.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <chrono>
1818
#include <filesystem>
1919
#include <memory>
20+
#include <optional>
2021
#include <string>
2122
#include <thread>
2223

@@ -122,6 +123,64 @@ TEST_F(RosbagCaptureTest, ConstructorFallsBackOnUnknownFormat) {
122123
EXPECT_EQ(rb->config().format, "sqlite3");
123124
}
124125

126+
// The crash-safety branches below force the storage probe via an injected double,
127+
// because mcap is installed in CI so the real probe cannot reach them there.
128+
129+
// @verifies REQ_INTEROP_088
130+
TEST_F(RosbagCaptureTest, ConfiguredFormatUnavailableFallsBackToSqlite3) {
131+
// A known format (mcap) whose plugin is unavailable degrades to sqlite3 and
132+
// capture stays enabled - the exact scenario the crash-safety change exists for.
133+
auto rosbag_config = create_rosbag_config();
134+
rosbag_config.format = "mcap";
135+
auto snapshot_config = create_snapshot_config();
136+
RosbagCapture::StorageProbeFn probe = [](const std::string & f) -> std::optional<std::string> {
137+
if (f == "mcap") {
138+
return std::string("simulated: mcap plugin not found");
139+
}
140+
return std::nullopt; // sqlite3 usable
141+
};
142+
std::shared_ptr<RosbagCapture> rb;
143+
EXPECT_NO_THROW(
144+
rb = std::make_shared<RosbagCapture>(node_.get(), storage_.get(), rosbag_config, snapshot_config, probe));
145+
ASSERT_NE(rb, nullptr);
146+
EXPECT_TRUE(rb->is_enabled());
147+
EXPECT_EQ(rb->config().format, "sqlite3");
148+
}
149+
150+
// @verifies REQ_INTEROP_088
151+
TEST_F(RosbagCaptureTest, NoUsableBackendDisablesCaptureWithoutCrashing) {
152+
// When neither the configured format nor sqlite3 is usable, capture self-disables
153+
// and the node keeps running (no throw out of the constructor).
154+
auto rosbag_config = create_rosbag_config();
155+
rosbag_config.format = "mcap";
156+
auto snapshot_config = create_snapshot_config();
157+
RosbagCapture::StorageProbeFn probe = [](const std::string &) -> std::optional<std::string> {
158+
return std::string("simulated: backend unavailable");
159+
};
160+
std::shared_ptr<RosbagCapture> rb;
161+
EXPECT_NO_THROW(
162+
rb = std::make_shared<RosbagCapture>(node_.get(), storage_.get(), rosbag_config, snapshot_config, probe));
163+
ASSERT_NE(rb, nullptr);
164+
EXPECT_FALSE(rb->is_enabled());
165+
}
166+
167+
// @verifies REQ_INTEROP_088
168+
TEST_F(RosbagCaptureTest, Sqlite3BaselineUnavailableDisablesCapture) {
169+
// When the always-shipped sqlite3 baseline itself fails to load, capture
170+
// self-disables rather than crashing the node.
171+
auto rosbag_config = create_rosbag_config();
172+
rosbag_config.format = "sqlite3";
173+
auto snapshot_config = create_snapshot_config();
174+
RosbagCapture::StorageProbeFn probe = [](const std::string &) -> std::optional<std::string> {
175+
return std::string("simulated: rosbag2 base install broken");
176+
};
177+
std::shared_ptr<RosbagCapture> rb;
178+
EXPECT_NO_THROW(
179+
rb = std::make_shared<RosbagCapture>(node_.get(), storage_.get(), rosbag_config, snapshot_config, probe));
180+
ASSERT_NE(rb, nullptr);
181+
EXPECT_FALSE(rb->is_enabled());
182+
}
183+
125184
// @verifies REQ_INTEROP_088
126185
TEST_F(RosbagCaptureTest, ConstructorAcceptsMcapFormat) {
127186
auto rosbag_config = create_rosbag_config();

src/ros2_medkit_gateway/src/http/handlers/bulkdata_handlers.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,9 @@ BulkDataHandlers::list_descriptors(const http::TypedRequest & req) {
227227
dto::Collection<dto::BulkDataDescriptor> response;
228228
for (const auto & rosbag : all_rosbags) {
229229
std::string fault_code = rosbag.value("fault_code", "");
230-
std::string format = rosbag.value("format", "mcap");
230+
// Default to sqlite3 (the FaultManager default) when a bag predates the
231+
// persisted format field; the per-bag metadata normally carries the real one.
232+
std::string format = rosbag.value("format", "sqlite3");
231233
uint64_t size_bytes = rosbag.value("size_bytes", uint64_t{0});
232234
double duration_sec = rosbag.value("duration_sec", 0.0);
233235

@@ -347,7 +349,9 @@ http::Result<http::BinaryResponse> BulkDataHandlers::download(const http::TypedR
347349
}
348350

349351
std::string file_path = rosbag_result.data["file_path"].get<std::string>();
350-
std::string format = rosbag_result.data.value("format", "mcap");
352+
// Default to sqlite3 (the FaultManager default) for bags predating the format
353+
// field; metadata normally carries the real one persisted at capture time.
354+
std::string format = rosbag_result.data.value("format", "sqlite3");
351355
mimetype = get_rosbag_mimetype(format);
352356
filename = fault_code + "." + format;
353357

0 commit comments

Comments
 (0)