@@ -3,6 +3,7 @@ use crate::plugins::node_groups::NodeGroupsPlugin;
33use crate :: store:: core:: StoreContext ;
44use crate :: ServerMode ;
55use log:: { debug, error, info} ;
6+ use shared:: models:: task:: Task ;
67use std:: collections:: HashMap ;
78use std:: sync:: Arc ;
89use std:: time:: Duration ;
@@ -33,6 +34,70 @@ impl MetricsSyncService {
3334 }
3435 }
3536
37+ /// Format task metadata into a structured string for Prometheus labels
38+ /// Example: "model:qwen3-4b|dataset:intellect-2-rl|version:v1"
39+ fn format_task_metadata ( task : & Task ) -> String {
40+ if let Some ( metadata) = & task. metadata {
41+ if let Some ( labels) = & metadata. labels {
42+ if !labels. is_empty ( ) {
43+ return labels
44+ . iter ( )
45+ . map ( |( k, v) | format ! ( "{}:{}" , k, v) )
46+ . collect :: < Vec < _ > > ( )
47+ . join ( "|" ) ;
48+ }
49+ }
50+ }
51+ "" . to_string ( )
52+ }
53+
54+ /// Efficiently get all node-to-group mappings including both group_id and group_config_name
55+ /// Returns a HashMap where key is node_address and value is (group_id, group_config_name)
56+ async fn get_all_node_group_info ( & self ) -> anyhow:: Result < HashMap < String , ( String , String ) > > {
57+ if let Some ( node_groups_plugin) = & self . node_groups_plugin {
58+ // First get all node to group_id mappings
59+ let node_to_group_mappings =
60+ match node_groups_plugin. get_all_node_group_mappings ( ) . await {
61+ Ok ( mappings) => mappings,
62+ Err ( e) => {
63+ error ! ( "Failed to get node group mappings: {}" , e) ;
64+ return Ok ( HashMap :: new ( ) ) ;
65+ }
66+ } ;
67+
68+ // Then get all groups to get their configuration names
69+ let groups = match node_groups_plugin. get_all_groups ( ) . await {
70+ Ok ( groups) => groups,
71+ Err ( e) => {
72+ error ! ( "Failed to get all groups: {}" , e) ;
73+ return Ok ( HashMap :: new ( ) ) ;
74+ }
75+ } ;
76+
77+ // Create a mapping from group_id to configuration_name
78+ let group_id_to_config: HashMap < String , String > = groups
79+ . into_iter ( )
80+ . map ( |group| ( group. id , group. configuration_name ) )
81+ . collect ( ) ;
82+
83+ // Combine the mappings to create node_address -> (group_id, group_config_name)
84+ let mut result = HashMap :: new ( ) ;
85+ for ( node_address, group_id) in node_to_group_mappings {
86+ if let Some ( config_name) = group_id_to_config. get ( & group_id) {
87+ result. insert ( node_address, ( group_id, config_name. clone ( ) ) ) ;
88+ } else {
89+ // If we can't find the config name, still include the group_id
90+ debug ! ( "No configuration name found for group_id: {}" , group_id) ;
91+ result. insert ( node_address, ( group_id, "unknown" . to_string ( ) ) ) ;
92+ }
93+ }
94+
95+ Ok ( result)
96+ } else {
97+ Ok ( HashMap :: new ( ) )
98+ }
99+ }
100+
36101 pub async fn run ( & self ) -> anyhow:: Result < ( ) > {
37102 // Only run the sync service on ProcessorOnly or Full mode instances
38103 if !matches ! (
@@ -86,6 +151,18 @@ impl MetricsSyncService {
86151 . map ( |task| ( task. id . to_string ( ) , task. name . clone ( ) ) )
87152 . collect ( ) ;
88153
154+ let node_to_group_info = if self . node_groups_plugin . is_some ( ) {
155+ match self . get_all_node_group_info ( ) . await {
156+ Ok ( info) => info,
157+ Err ( e) => {
158+ error ! ( "Failed to get node group info: {}" , e) ;
159+ HashMap :: new ( )
160+ }
161+ }
162+ } else {
163+ HashMap :: new ( )
164+ } ;
165+
89166 // Clear existing Prometheus metrics
90167 self . metrics_context . clear_compute_task_metrics ( ) ;
91168
@@ -99,12 +176,19 @@ impl MetricsSyncService {
99176
100177 for ( label, node_metrics) in task_metrics {
101178 for ( node_address, value) in node_metrics {
179+ let ( group_id, group_config_name) = node_to_group_info
180+ . get ( & node_address)
181+ . map ( |( id, config) | ( Some ( id. as_str ( ) ) , Some ( config. as_str ( ) ) ) )
182+ . unwrap_or ( ( None , None ) ) ;
183+
102184 self . metrics_context . record_compute_task_gauge (
103185 & node_address,
104186 & task_id,
105187 & task_name,
106188 & label,
107189 value,
190+ group_id,
191+ group_config_name,
108192 ) ;
109193 total_metrics += 1 ;
110194 }
@@ -150,6 +234,15 @@ impl MetricsSyncService {
150234 self . metrics_context . set_tasks_count ( total_tasks) ;
151235 debug ! ( "Synced task statistics: {} total tasks" , total_tasks) ;
152236
237+ // Sync task info metrics with metadata
238+ for task in & tasks {
239+ let task_id = task. id . to_string ( ) ;
240+ let metadata = Self :: format_task_metadata ( task) ;
241+ self . metrics_context
242+ . set_task_info ( & task_id, & task. name , & metadata) ;
243+ }
244+ debug ! ( "Synced task info metrics with metadata" ) ;
245+
153246 // Sync nodes per task based on node assignments
154247 // Create task name mapping
155248 let task_name_map: HashMap < String , String > = tasks
@@ -202,3 +295,86 @@ impl MetricsSyncService {
202295 Ok ( ( ) )
203296 }
204297}
298+
#[cfg(test)]
mod tests {
    use super::*;
    use shared::models::task::{Task, TaskMetadata, TaskState};
    use std::collections::HashMap;
    use uuid::Uuid;

    /// Builds a pending task named "test" that carries the given metadata.
    fn task_with_metadata(metadata: Option<TaskMetadata>) -> Task {
        Task {
            id: Uuid::new_v4(),
            image: "test".to_string(),
            name: "test".to_string(),
            state: TaskState::PENDING,
            metadata,
            ..Default::default()
        }
    }

    #[test]
    fn test_format_task_metadata_with_labels() {
        let labels: HashMap<String, String> = [
            ("model", "qwen3-4b"),
            ("dataset", "intellect-2-rl-dataset"),
            ("version", "v1"),
        ]
        .iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();

        let task = task_with_metadata(Some(TaskMetadata {
            labels: Some(labels),
        }));

        let formatted = MetricsSyncService::format_task_metadata(&task);

        // Every label must appear as a `key:value` pair; the checks do not
        // depend on the order in which the pairs are emitted.
        for expected in [
            "model:qwen3-4b",
            "dataset:intellect-2-rl-dataset",
            "version:v1",
        ]
        .iter()
        {
            assert!(formatted.contains(*expected));
        }

        // Three labels are joined by exactly two `|` separators.
        assert_eq!(formatted.matches('|').count(), 2);
    }

    #[test]
    fn test_format_task_metadata_empty() {
        // No metadata at all.
        let task = task_with_metadata(None);

        let formatted = MetricsSyncService::format_task_metadata(&task);
        assert_eq!(formatted, "");
    }

    #[test]
    fn test_format_task_metadata_empty_labels() {
        // Metadata present, label map present but empty.
        let task = task_with_metadata(Some(TaskMetadata {
            labels: Some(HashMap::new()),
        }));

        let formatted = MetricsSyncService::format_task_metadata(&task);
        assert_eq!(formatted, "");
    }

    #[test]
    fn test_format_task_metadata_no_labels() {
        // Metadata present, label map absent.
        let task = task_with_metadata(Some(TaskMetadata { labels: None }));

        let formatted = MetricsSyncService::format_task_metadata(&task);
        assert_eq!(formatted, "");
    }
}
0 commit comments