2020use std:: time:: Duration ;
2121
2222use backfill:: {
23- BackfillClient , BackfillError , IntoTaskHandlerResult , JobCompleteContext , JobFailContext ,
24- JobPermanentlyFailContext , JobStartContext , LifecycleHooks , TaskHandler , WorkerConfig , WorkerContext , WorkerRunner ,
25- WorkerShutdownContext , WorkerStartContext , enqueue_fast,
23+ BackfillClient , BackfillError , HookRegistry , IntoTaskHandlerResult , JobComplete , JobCompleteContext , JobFail ,
24+ JobFailContext , JobPermanentlyFail , JobPermanentlyFailContext , JobStart , JobStartContext , Plugin , TaskHandler ,
25+ WorkerConfig , WorkerContext , WorkerRunner , WorkerShutdown , WorkerShutdownContext , WorkerStart , WorkerStartContext ,
26+ enqueue_fast,
2627} ;
2728use serde:: { Deserialize , Serialize } ;
2829use tokio_util:: sync:: CancellationToken ;
@@ -37,95 +38,97 @@ use tokio_util::sync::CancellationToken;
3738#[ derive( Clone ) ]
3839struct MetricsPlugin ;
3940
40- impl LifecycleHooks for MetricsPlugin {
41- async fn on_worker_start ( & self , ctx : WorkerStartContext ) {
42- metrics:: gauge!( "backfill_worker_active" ) . set ( 1.0 ) ;
43- log:: info!( "Worker {} started - metrics recording enabled" , ctx. worker_id) ;
44- }
45-
46- async fn on_worker_shutdown ( & self , ctx : WorkerShutdownContext ) {
47- metrics:: gauge!( "backfill_worker_active" ) . set ( 0.0 ) ;
48- log:: info!( "Worker {} shutdown (reason: {:?})" , ctx. worker_id, ctx. reason) ;
49- }
50-
51- async fn on_job_start ( & self , ctx : JobStartContext ) {
52- let task = ctx. job . task_identifier ( ) ;
53- let attempt = ctx. job . attempts ( ) ;
54-
55- metrics:: counter!( "jobs_started" , "task" => task. clone( ) ) . increment ( 1 ) ;
56-
57- log:: debug!( "Job started: {} (attempt {})" , task, attempt) ;
58-
59- // Track wait time (time from creation to start)
60- let created_at = ctx. job . created_at ( ) ;
61- let wait_time = chrono:: Utc :: now ( ) . signed_duration_since ( * created_at) . num_milliseconds ( ) as f64 / 1000.0 ;
62-
63- metrics:: histogram!( "job_wait_time_seconds" , "task" => task. clone( ) ) . record ( wait_time) ;
64- }
65-
66- async fn on_job_complete ( & self , ctx : JobCompleteContext ) {
67- let task = ctx. job . task_identifier ( ) ;
68- let attempt = ctx. job . attempts ( ) ;
69- let duration = ctx. duration . as_secs_f64 ( ) ;
70-
71- metrics:: counter!( "jobs_completed" , "task" => task. clone( ) , "attempt" => attempt. to_string( ) ) . increment ( 1 ) ;
72-
73- metrics:: histogram!( "job_duration_seconds" , "task" => task. clone( ) , "status" => "success" ) . record ( duration) ;
74-
75- log:: info!(
76- "Job completed: {} (attempt {}, duration: {:.2}s)" ,
77- task,
78- attempt,
79- duration
80- ) ;
81- }
82-
83- async fn on_job_fail ( & self , ctx : JobFailContext ) {
84- let task = ctx. job . task_identifier ( ) ;
85- let attempt = ctx. job . attempts ( ) ;
86- let will_retry = ctx. will_retry ;
87-
88- // Use will_retry to distinguish between transient failures and final failures
89- let status = if will_retry { "retrying" } else { "failed" } ;
90-
91- metrics:: counter!(
92- "jobs_failed" ,
93- "task" => task. clone( ) ,
94- "attempt" => attempt. to_string( ) ,
95- "will_retry" => status
96- )
97- . increment ( 1 ) ;
98-
99- // Classify the error type for more detailed metrics
100- let error_type = classify_error ( & ctx. error ) ;
101- metrics:: counter!(
102- "job_errors_by_type" ,
103- "task" => task. clone( ) ,
104- "error_type" => error_type
105- )
106- . increment ( 1 ) ;
107-
108- log:: warn!(
109- "Job failed: {} (attempt {}, will_retry: {}, error: {})" ,
110- task,
111- attempt,
112- will_retry,
113- ctx. error
114- ) ;
115- }
116-
117- async fn on_job_permanently_fail ( & self , ctx : JobPermanentlyFailContext ) {
118- let task = ctx. job . task_identifier ( ) ;
119- let final_attempt = ctx. job . attempts ( ) ;
120-
121- metrics:: counter!( "jobs_permanently_failed" , "task" => task. clone( ) ) . increment ( 1 ) ;
122-
123- log:: error!(
124- "Job permanently failed: {} (final attempt: {}, error: {})" ,
125- task,
126- final_attempt,
127- ctx. error
128- ) ;
41+ impl Plugin for MetricsPlugin {
42+ fn register ( self , hooks : & mut HookRegistry ) {
43+ hooks. on ( WorkerStart , |ctx : WorkerStartContext | async move {
44+ metrics:: gauge!( "backfill_worker_active" ) . set ( 1.0 ) ;
45+ log:: info!( "Worker {} started - metrics recording enabled" , ctx. worker_id) ;
46+ } ) ;
47+
48+ hooks. on ( WorkerShutdown , |ctx : WorkerShutdownContext | async move {
49+ metrics:: gauge!( "backfill_worker_active" ) . set ( 0.0 ) ;
50+ log:: info!( "Worker {} shutdown (reason: {:?})" , ctx. worker_id, ctx. reason) ;
51+ } ) ;
52+
53+ hooks. on ( JobStart , |ctx : JobStartContext | async move {
54+ let task = ctx. job . task_identifier ( ) ;
55+ let attempt = ctx. job . attempts ( ) ;
56+
57+ metrics:: counter!( "jobs_started" , "task" => task. clone( ) ) . increment ( 1 ) ;
58+
59+ log:: debug!( "Job started: {} (attempt {})" , task, attempt) ;
60+
61+ // Track wait time (time from creation to start)
62+ let created_at = ctx. job . created_at ( ) ;
63+ let wait_time = chrono:: Utc :: now ( ) . signed_duration_since ( * created_at) . num_milliseconds ( ) as f64 / 1000.0 ;
64+
65+ metrics:: histogram!( "job_wait_time_seconds" , "task" => task. clone( ) ) . record ( wait_time) ;
66+ } ) ;
67+
68+ hooks. on ( JobComplete , |ctx : JobCompleteContext | async move {
69+ let task = ctx. job . task_identifier ( ) ;
70+ let attempt = ctx. job . attempts ( ) ;
71+ let duration = ctx. duration . as_secs_f64 ( ) ;
72+
73+ metrics:: counter!( "jobs_completed" , "task" => task. clone( ) , "attempt" => attempt. to_string( ) ) . increment ( 1 ) ;
74+
75+ metrics:: histogram!( "job_duration_seconds" , "task" => task. clone( ) , "status" => "success" ) . record ( duration) ;
76+
77+ log:: info!(
78+ "Job completed: {} (attempt {}, duration: {:.2}s)" ,
79+ task,
80+ attempt,
81+ duration
82+ ) ;
83+ } ) ;
84+
85+ hooks. on ( JobFail , |ctx : JobFailContext | async move {
86+ let task = ctx. job . task_identifier ( ) ;
87+ let attempt = ctx. job . attempts ( ) ;
88+ let will_retry = ctx. will_retry ;
89+
90+ // Use will_retry to distinguish between transient failures and final failures
91+ let status = if will_retry { "retrying" } else { "failed" } ;
92+
93+ metrics:: counter!(
94+ "jobs_failed" ,
95+ "task" => task. clone( ) ,
96+ "attempt" => attempt. to_string( ) ,
97+ "will_retry" => status
98+ )
99+ . increment ( 1 ) ;
100+
101+ // Classify the error type for more detailed metrics
102+ let error_type = classify_error ( & ctx. error ) ;
103+ metrics:: counter!(
104+ "job_errors_by_type" ,
105+ "task" => task. clone( ) ,
106+ "error_type" => error_type
107+ )
108+ . increment ( 1 ) ;
109+
110+ log:: warn!(
111+ "Job failed: {} (attempt {}, will_retry: {}, error: {})" ,
112+ task,
113+ attempt,
114+ will_retry,
115+ ctx. error
116+ ) ;
117+ } ) ;
118+
119+ hooks. on ( JobPermanentlyFail , |ctx : JobPermanentlyFailContext | async move {
120+ let task = ctx. job . task_identifier ( ) ;
121+ let final_attempt = ctx. job . attempts ( ) ;
122+
123+ metrics:: counter!( "jobs_permanently_failed" , "task" => task. clone( ) ) . increment ( 1 ) ;
124+
125+ log:: error!(
126+ "Job permanently failed: {} (final attempt: {}, error: {})" ,
127+ task,
128+ final_attempt,
129+ ctx. error
130+ ) ;
131+ } ) ;
129132 }
130133}
131134
0 commit comments