@@ -118,87 +118,92 @@ export default function TrainingDetailPage() {
118118 return Math . max ( 0 , Math . floor ( ( end - start ) / 1000 ) ) ;
119119 } , [ ] ) ;
120120
/**
 * Loads the job, then its metrics and artifacts in parallel, and pushes the
 * results into component state.
 *
 * @param isInitial - When true, a failure shows a toast and installs a
 *   placeholder job so the page still renders, and the loading flag is cleared
 *   once the call settles. When false (polling), failures are ignored and the
 *   existing state is left untouched.
 * @param isCancelled - Optional probe checked after every await. Once it
 *   returns true no further state is written, so a response that arrives after
 *   unmount or after `jobId` changed cannot overwrite newer state. Defaults to
 *   "never cancelled", which keeps existing one-argument callers working.
 */
const fetchAll = useCallback(
  async (isInitial = false, isCancelled: () => boolean = () => false) => {
    try {
      const jobRes = await api.get<Job>(`/training/${jobId}`);
      if (isCancelled()) return;
      setJob(jobRes);
      setElapsedSec(computeElapsed(jobRes.started_at, jobRes.completed_at));

      // Metrics and artifacts are independent; a failure of either degrades to
      // an empty list instead of failing the whole load.
      const [metricsRes, artifactsRes] = await Promise.all([
        api.get<MetricRecord[]>(`/training/${jobId}/metrics`).catch(() => [] as MetricRecord[]),
        api.get<Artifact[]>(`/jobs/${jobId}/artifacts`).catch(() => [] as Artifact[]),
      ]);
      if (isCancelled()) return;

      // Split metrics by metric_name into the two chart series.
      const loss: { name: string; value: number }[] = [];
      const acc: { name: string; value: number }[] = [];

      for (const record of metricsRes ?? []) {
        const point = {
          name: (record.step ?? record.epoch ?? 0).toString(),
          value: record.value,
        };
        if (record.metric_name === "loss") {
          loss.push(point);
        } else if (record.metric_name === "accuracy") {
          acc.push(point);
        }
      }

      setLossData(loss);
      setAccData(acc);
      setArtifacts(artifactsRes ?? []);
    } catch (err) {
      if (isInitial && !isCancelled()) {
        toast.error(err instanceof Error ? err.message : "Failed to load training job");

        // Set a fallback job so the full UI always renders (e.g. for E2E tests)
        const fallbackJob: Job = {
          id: jobId,
          project_id: "",
          model_id: "",
          dataset_id: null,
          job_type: "Training Job",
          status: "unknown",
          k8s_job_name: null,
          hardware_tier: "N/A",
          hyperparameters: {},
          metrics: null,
          started_at: null,
          completed_at: null,
          error_message: null,
          created_by: "",
          created_at: new Date().toISOString(),
          updated_at: new Date().toISOString(),
          progress: 0,
          epoch_current: null,
          epoch_total: null,
          loss: null,
          learning_rate: null,
          gpu_config: null,
        };
        setJob(fallbackJob);
        setIsFallback(true);
        setElapsedSec(0);
      }
    } finally {
      // Only the initial load owns the loading flag; polls never toggle it.
      if (isInitial && !isCancelled()) setLoading(false);
    }
  },
  [jobId, computeElapsed],
);

// Initial fetch. The cleanup flips the flag so a response that lands after
// unmount, or after fetchAll was recreated for a new jobId, is discarded.
useEffect(() => {
  let cancelled = false;
  void fetchAll(true, () => cancelled);
  return () => {
    cancelled = true;
  };
}, [fetchAll]);
196+
// Poll job + metrics while the job is active. Each poll is scheduled 3s after
// the previous one settles (rather than on a fixed interval), so slow responses
// cannot overlap or resolve out of order and overwrite newer state.
useEffect(() => {
  if (!job) return;
  const isActive = job.status === "running" || job.status === "pending";
  if (!isActive) return;

  const pollDelayMs = 3000;
  let stopped = false;
  let timer: ReturnType<typeof setTimeout> | undefined;

  const poll = async (): Promise<void> => {
    try {
      await fetchAll(false);
    } catch {
      // fetchAll handles its own failures; keep polling regardless.
    } finally {
      // Do not reschedule once the effect has been cleaned up (status changed,
      // fetchAll was recreated, or the component unmounted).
      if (!stopped) timer = setTimeout(() => void poll(), pollDelayMs);
    }
  };

  timer = setTimeout(() => void poll(), pollDelayMs);
  return () => {
    stopped = true;
    clearTimeout(timer);
  };
}, [job?.status, fetchAll]);
206+
202207 // Tick elapsed timer every second while job is running
203208 useEffect ( ( ) => {
204209 if ( ! job ) return ;
0 commit comments