Skip to content

Commit 68fdeb1

Browse files
committed
fix(log_bridge): address PR review feedback
Stable FNV-1a fault_code, uniform node_source_id, sanitized prefix, LRU-bounded reporters, null guard. Cooldown is now ERROR/FATAL-only and keyed by severity so a WARN cannot drop an ERROR escalation. Unique test domain range, docs-build wiring, and a launch_testing integration test for the live /rosout path.
1 parent 260b374 commit 68fdeb1

13 files changed

Lines changed: 629 additions & 84 deletions

File tree

docs/Doxyfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ INPUT = ../src/ros2_medkit_gateway/include \
1414
../src/ros2_medkit_fault_reporter/src \
1515
../src/ros2_medkit_diagnostic_bridge/include \
1616
../src/ros2_medkit_diagnostic_bridge/src \
17+
../src/ros2_medkit_log_bridge/include \
18+
../src/ros2_medkit_log_bridge/src \
19+
../src/ros2_medkit_action_status_bridge/include \
20+
../src/ros2_medkit_action_status_bridge/src \
1721
../src/ros2_medkit_serialization/include \
1822
../src/ros2_medkit_serialization/src \
1923
../src/ros2_medkit_msgs

docs/api/cpp.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,22 @@ Bridge node that converts ROS 2 /diagnostics messages to FaultManager faults.
8383
.. doxygenclass:: ros2_medkit_diagnostic_bridge::DiagnosticBridgeNode
8484
:members:
8585

86+
ros2_medkit_log_bridge
87+
----------------------
88+
89+
Bridge node that promotes ROS 2 /rosout log entries to FaultManager faults.
90+
91+
.. doxygenclass:: ros2_medkit_log_bridge::LogBridgeNode
92+
:members:
93+
94+
ros2_medkit_action_status_bridge
95+
--------------------------------
96+
97+
Bridge node that turns terminal ROS 2 action goal states into FaultManager faults.
98+
99+
.. doxygenclass:: ros2_medkit_action_status_bridge::ActionStatusBridgeNode
100+
:members:
101+
86102
ros2_medkit_serialization
87103
-------------------------
88104

docs/changelog.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ This page aggregates changelogs from all ros2_medkit packages.
1414

1515
.. include:: ../src/ros2_medkit_diagnostic_bridge/CHANGELOG.rst
1616

17+
.. include:: ../src/ros2_medkit_log_bridge/CHANGELOG.rst
18+
19+
.. include:: ../src/ros2_medkit_action_status_bridge/CHANGELOG.rst
20+
1721
.. include:: ../src/ros2_medkit_serialization/CHANGELOG.rst
1822

1923
.. include:: ../src/ros2_medkit_msgs/CHANGELOG.rst

docs/introduction.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@ ros2_medkit consists of several ROS 2 packages:
5959
**ros2_medkit_diagnostic_bridge**
6060
Bridge node that converts standard ROS 2 ``/diagnostics`` messages to fault manager faults.
6161

62+
**ros2_medkit_log_bridge**
63+
Bridge node that promotes ROS 2 ``/rosout`` log entries to fault manager faults.
64+
65+
**ros2_medkit_action_status_bridge**
66+
Bridge node that turns terminal ROS 2 action goal states (aborted) into fault manager faults.
67+
6268
**ros2_medkit_msgs**
6369
Message and service definitions for fault management.
6470

src/ros2_medkit_cmake/cmake/ROS2MedkitTestDomain.cmake

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727
# ros2_medkit_topic_beacon: 110 - 119 (10 slots)
2828
# ros2_medkit_graph_provider: 120 - 129 (10 slots)
2929
# ros2_medkit_linux_introspection: 130 - 139 (10 slots)
30-
# ros2_medkit_integration_tests: 140 - 219 (80 slots)
30+
# ros2_medkit_integration_tests: 140 - 209 (70 slots)
31+
# ros2_medkit_log_bridge: 210 - 214 (5 slots, carved from integration_tests)
32+
# ros2_medkit_action_status_bridge: 215 - 219 (5 slots, carved from integration_tests)
3133
# ros2_medkit_opcua: 220 - 229 (10 slots, carved from integration_tests)
3234
# multi-domain tests (secondary): 230 - 232 (3 slots, reserved for peer_aggregation etc.)
3335
#

src/ros2_medkit_integration_tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ if(BUILD_TESTING)
110110
set(_INTEG_PORT 9100)
111111

112112
include(ROS2MedkitTestDomain)
113-
medkit_init_test_domains(START 140 END 229)
113+
medkit_init_test_domains(START 140 END 209)
114114

115115
# Feature tests (atomic, independent, 120s timeout)
116116
file(GLOB FEATURE_TESTS test/features/*.test.py)

src/ros2_medkit_log_bridge/CMakeLists.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ ament_export_dependencies(rclcpp rcl_interfaces ros2_medkit_msgs ros2_medkit_fau
8888
if(BUILD_TESTING)
8989
find_package(ament_lint_auto REQUIRED)
9090
find_package(ament_cmake_gtest REQUIRED)
91+
find_package(launch_testing_ament_cmake REQUIRED)
9192

9293
set(ament_cmake_clang_format_CONFIG_FILE "${CMAKE_CURRENT_SOURCE_DIR}/../../.clang-format")
9394
list(APPEND AMENT_LINT_AUTO_EXCLUDE ament_cmake_uncrustify ament_cmake_cpplint ament_cmake_clang_tidy)
@@ -96,7 +97,7 @@ if(BUILD_TESTING)
9697
ros2_medkit_clang_tidy()
9798

9899
include(ROS2MedkitTestDomain)
99-
medkit_init_test_domains(START 90 END 99)
100+
medkit_init_test_domains(START 210 END 214)
100101

101102
ament_add_gtest(test_log_bridge test/test_log_bridge.cpp)
102103
target_link_libraries(test_log_bridge log_bridge_lib)
@@ -108,6 +109,16 @@ if(BUILD_TESTING)
108109
target_link_options(test_log_bridge PRIVATE --coverage)
109110
endif()
110111

112+
# Integration test (launch_testing): fault_manager + bridge + synthetic /rosout
113+
install(DIRECTORY test
114+
DESTINATION share/${PROJECT_NAME}
115+
)
116+
add_launch_test(
117+
test/test_integration.test.py
118+
TARGET test_integration
119+
TIMEOUT 90
120+
)
121+
111122
ros2_medkit_relax_vendor_warnings()
112123
endif()
113124

src/ros2_medkit_log_bridge/README.md

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,22 +21,51 @@ above a severity floor to the FaultManager:
2121
| ERROR (40) | `SEVERITY_ERROR` |
2222
| FATAL (50) | `SEVERITY_CRITICAL` |
2323

24-
- `source_id` of each fault is the originating node (the `Log.name` field), via
25-
a per-node `FaultReporter`, so faults attribute correctly and each node gets
26-
its own local debounce.
27-
- `fault_code` is auto-generated as `<PREFIX>_<NODE>_<HASH>`, where the hash is
28-
taken over a normalized message template (numbers / hex / paths stripped) so
29-
the same logical message maps to the same code across occurrences.
30-
31-
## WARN as PREFAILED
32-
33-
The bridge forwards WARN immediately. Whether a WARN shows as `PREFAILED`
34-
(suspected, kept out of the confirmed-fault list) or `CONFIRMED` is decided by
35-
the FaultManager's `confirmation_threshold`, not the bridge. For the
36-
visible-but-quiet behaviour, launch the FaultManager with
37-
`confirmation_threshold:=-2` or lower (or an entity threshold for `LOG_*`
38-
codes). With the shipped default (`-1`), every WARN confirms on first
39-
occurrence.
24+
- `source_id` of each fault is the originating node's fully-qualified name. It
25+
is derived from `Log.name` by taking the first dotted segment (a `Log.name`
26+
may carry a sub-logger suffix, e.g. `controller_manager.resource_manager`, and
27+
node names cannot contain `.`) and prefixing `/`, giving e.g.
28+
`/controller_manager`. The gateway discovers entities by node FQN, so this is
29+
the form that lets a fault (and its snapshots / rosbag) associate with the
30+
entity in the SOVD tree. Each node gets its own per-node `FaultReporter` and
31+
therefore its own client-side debounce.
32+
- `fault_code` is auto-generated as `<PREFIX>_<NODE>_<HASH>`. `<HASH>` is a fixed
33+
FNV-1a 32-bit digest (8 lowercase hex) of a normalized message template
34+
(numbers / hex / paths stripped, isolated single-letter tokens dropped) so the
35+
same logical message maps to the same code across occurrences. `<NODE>` is the
36+
upper-snake of `source_id`. The 8-hex hash is never truncated; if the 64-char
37+
cap is hit the node part is trimmed instead.
38+
39+
> Namespaced-node limitation: `Log.name` encodes a node's namespace with the same
40+
> `.` separator as a sub-logger suffix, so the two are indistinguishable from the
41+
> string alone. `source_id` takes the first dotted segment, which is right for a
42+
> non-namespaced node with a sub-logger but collapses a namespaced node
43+
> (`robot1.planner_server` -> `/robot1`) to its namespace, so same-named nodes in
44+
> different namespaces share one code. Multi-robot fleets typically isolate robots
45+
> by `ROS_DOMAIN_ID` (one gateway per robot, federated by peer aggregation), which
46+
> sidesteps this.
47+
48+
## Forwarding, the LocalFilter, and confirmation
49+
50+
Two independent debounces sit between a log line and a confirmed fault:
51+
52+
1. Per-node `FaultReporter` `LocalFilter` (client-side). WARN is held until
53+
`default_threshold` (3) occurrences within `default_window_sec` (10s).
54+
ERROR/FATAL have severity `>= bypass_severity` (2) and bypass the filter,
55+
forwarding immediately.
56+
2. Bridge `report_cooldown_sec` cooldown, applied only to `ERROR`/`FATAL` (the
57+
levels that bypass the LocalFilter). It forwards the first occurrence of a
58+
`(fault_code, severity)` immediately and suppresses that same pair for
59+
`report_cooldown_sec` (default 5s, `0.0` disables), bounding a flood. `WARN`
60+
is never cooled here (that would starve its LocalFilter threshold counting),
61+
and keying on severity means a `WARN` never suppresses a same-message `ERROR`
62+
escalation.
63+
64+
Whether a forwarded fault then shows as `PREFAILED` (suspected) or `CONFIRMED`
65+
is a separate, gateway-side decision driven by the FaultManager's
66+
`confirmation_threshold` - not by this bridge and not by the client-side
67+
LocalFilter. For visible-but-quiet WARNs, launch the FaultManager with a low
68+
`confirmation_threshold` (or an entity threshold for `LOG_*` codes).
4069

4170
## Hard limitations (by construction)
4271

@@ -59,7 +88,14 @@ ros2 launch ros2_medkit_log_bridge log_bridge.launch.py
5988
| Param | Default | Meaning |
6089
|-------|---------|---------|
6190
| `rosout_topic` | `/rosout` | log topic to subscribe |
62-
| `severity_floor` | `30` (WARN) | minimum level promoted; raise to `40` on chatty / constrained targets |
63-
| `code_prefix` | `LOG` | prefix for generated fault codes |
64-
| `exclude_nodes` | `[]` | node-name substrings to skip |
65-
| `include_only_nodes` | `[]` | if set, only promote these nodes |
91+
| `severity_floor` | `30` (WARN) | minimum level promoted; raise to `40` on chatty / constrained targets. Clamped to `[0, 50]` at load (a value out of range is corrected with a warning) |
92+
| `code_prefix` | `LOG` | prefix for generated fault codes; normalized to `[A-Z0-9_]` at load |
93+
| `exclude_nodes` | `[]` | node-FQN substrings to skip |
94+
| `include_only_nodes` | `[]` | if set, only promote nodes whose FQN matches |
95+
| `max_tracked_nodes` | `512` | cap on per-node reporters; least-recently-used nodes evicted past this |
96+
| `report_cooldown_sec` | `5.0` | per-fault_code forward debounce; `0.0` disables |
97+
98+
`exclude_nodes` / `include_only_nodes` match as **unanchored substrings**
99+
against the node FQN: `planner` matches `/planner_server` and
100+
`/robot1/planner_server`. Use a longer, more specific substring (e.g.
101+
`/planner_server`) to avoid accidental matches.

src/ros2_medkit_log_bridge/config/log_bridge.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,14 @@ log_bridge:
88
severity_floor: 30
99
# Prefix for auto-generated fault codes (<PREFIX>_<NODE>_<HASH>).
1010
code_prefix: "LOG"
11-
# Originating-node name substrings to skip (e.g. noisy debug nodes).
11+
# Originating-node FQN substrings to skip (e.g. noisy debug nodes).
1212
exclude_nodes: []
1313
# If non-empty, ONLY promote logs from nodes matching these substrings.
1414
include_only_nodes: []
15+
# Cap on per-node FaultReporters; least-recently-used nodes are evicted
16+
# past this to bound memory under transient-node churn.
17+
max_tracked_nodes: 512
18+
# Per-fault_code forward debounce (seconds). First occurrence of a code is
19+
# forwarded immediately; the same code within the window is suppressed.
20+
# 0.0 disables. Tames ERROR/FATAL floods, which bypass the per-node filter.
21+
report_cooldown_sec: 5.0

src/ros2_medkit_log_bridge/include/ros2_medkit_log_bridge/log_bridge_node.hpp

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@
1414

1515
#pragma once
1616

17-
#include <map>
17+
#include <list>
1818
#include <memory>
1919
#include <mutex>
2020
#include <string>
21+
#include <unordered_map>
2122
#include <vector>
2223

2324
#include "rclcpp/rclcpp.hpp"
@@ -30,11 +31,11 @@ namespace ros2_medkit_log_bridge {
3031
///
3132
/// Subscribes to /rosout (rcl_interfaces/msg/Log) and forwards entries at or
3233
/// above a configurable severity floor to the FaultManager, attributing each
33-
/// fault to the originating node (the Log.name field) via a per-source
34+
/// fault to the originating node's fully-qualified name via a per-source
3435
/// FaultReporter. Drop-in compat adapter, same category as
3536
/// ros2_medkit_diagnostic_bridge: native FaultReporter instrumentation stays
3637
/// the canonical path; this bridge is the fallback for nodes that only log.
37-
/// Level mapping and the WARN-as-PREFAILED caveat are documented in README.md.
38+
/// Level mapping and the WARN/LocalFilter caveat are documented in README.md.
3839
///
3940
/// Hard limitation by construction: only sees rclcpp logs that reach /rosout
4041
/// from a still-alive node. Console-only loggers and crash-before-flush are out
@@ -47,19 +48,20 @@ class LogBridgeNode : public rclcpp::Node {
4748
/// Returns false when the level is below the floor / not promotable.
4849
static bool map_level_to_severity(uint8_t log_level, uint8_t severity_floor, uint8_t * severity_out);
4950

50-
/// Auto-generate a stable fault code from the originating node name and the
51+
/// Auto-generate a stable fault code from the originating node's FQN and the
5152
/// log message. Numbers/hex/paths in the message are normalized away so the
5253
/// same logical message maps to the same code across occurrences.
5354
/// Format: <PREFIX>_<NODE>_<HASH>, clamped to medkit's [A-Z0-9_] / 64-char rule.
54-
std::string generate_fault_code(const std::string & node_name, const std::string & message) const;
55+
std::string generate_fault_code(const std::string & source_id, const std::string & message) const;
5556

5657
/// Normalize a log message into a stable template (lowercased, digit/hex/path
57-
/// runs stripped, whitespace collapsed). Exposed for unit testing.
58+
/// runs stripped, whitespace collapsed, isolated single-letter tokens dropped).
59+
/// Exposed for unit testing.
5860
static std::string normalize_message(const std::string & message);
5961

6062
/// Whether a given originating node should be promoted, honouring the
6163
/// include/exclude lists. Exposed for unit testing.
62-
bool node_is_eligible(const std::string & node_name) const;
64+
bool node_is_eligible(const std::string & source_id) const;
6365

6466
/// Map an rcl_interfaces/msg/Log.name (a logger name, e.g. "bt_navigator" or
6567
/// "controller_manager.resource_manager") to the originating node's
@@ -69,29 +71,55 @@ class LogBridgeNode : public rclcpp::Node {
6971
/// the entity in the SOVD tree. Exposed for unit testing.
7072
static std::string node_source_id(const std::string & log_name);
7173

72-
private:
73-
void log_callback(const rcl_interfaces::msg::Log::ConstSharedPtr & msg);
74+
/// FNV-1a 32-bit hash, fixed spec, emitted as 8 lowercase hex chars. Exposed
75+
/// for unit testing so a known input asserts a known constant.
76+
static std::string fnv1a_hex(const std::string & in);
77+
78+
/// Whether a (fault_code, severity) may be forwarded now under the cooldown
79+
/// (first occurrence passes; same code+severity within report_cooldown_sec is
80+
/// suppressed; 0.0 disables). Keyed by severity so a WARN never suppresses a
81+
/// same-message ERROR escalation. Exposed for unit testing.
82+
bool cooldown_allows(const std::string & fault_code, uint8_t severity, rclcpp::Time now);
7483

7584
/// Fetch (or lazily create) the per-source FaultReporter for an originating
7685
/// node, so the fault's source_id is the node that logged, not the bridge.
77-
ros2_medkit_fault_reporter::FaultReporter * reporter_for(const std::string & node_name);
86+
/// Bounded by max_tracked_nodes_ with LRU eviction. Exposed for unit testing.
87+
ros2_medkit_fault_reporter::FaultReporter * reporter_for(const std::string & source_id);
88+
89+
/// Number of currently tracked per-node reporters. Exposed for unit testing.
90+
size_t tracked_reporter_count();
91+
92+
private:
93+
void log_callback(const rcl_interfaces::msg::Log::ConstSharedPtr & msg);
7894

7995
void load_parameters();
8096

8197
static std::string to_upper_snake(const std::string & in, size_t max_len);
8298

8399
rclcpp::Subscription<rcl_interfaces::msg::Log>::SharedPtr log_sub_;
84100

85-
// One FaultReporter per originating node (correct source_id + own LocalFilter).
86-
std::map<std::string, std::unique_ptr<ros2_medkit_fault_reporter::FaultReporter>> reporters_;
101+
// One FaultReporter per originating node (correct source_id + own LocalFilter),
102+
// LRU-ordered: front() is the least-recently-used entry.
103+
struct ReporterEntry {
104+
std::string source_id;
105+
std::unique_ptr<ros2_medkit_fault_reporter::FaultReporter> reporter;
106+
};
107+
std::list<ReporterEntry> reporters_lru_;
108+
std::unordered_map<std::string, std::list<ReporterEntry>::iterator> reporters_;
87109
std::mutex reporters_mutex_;
88110

111+
// Per-fault_code forward cooldown: last time a code was forwarded.
112+
std::unordered_map<std::string, rclcpp::Time> last_forward_;
113+
std::mutex cooldown_mutex_;
114+
89115
// Configuration
90116
std::string rosout_topic_;
91117
uint8_t severity_floor_;
92118
std::string code_prefix_;
93119
std::vector<std::string> exclude_nodes_;
94120
std::vector<std::string> include_only_nodes_;
121+
int max_tracked_nodes_;
122+
double report_cooldown_sec_;
95123
std::string own_node_name_;
96124
};
97125

0 commit comments

Comments
 (0)