@@ -300,6 +300,100 @@ func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
300300 }
301301}
302302
303+ // TestAllocRunner_Restore_VolatileAllocDir asserts that restoring a running
304+ // system alloc rebuilds task directories after client.alloc_dir was wiped,
305+ // while state in data_dir persists.
306+ func TestAllocRunner_Restore_VolatileAllocDir (t * testing.T ) {
307+ ci .Parallel (t )
308+
309+ alloc := mock .Alloc ()
310+ alloc .Job .Type = structs .JobTypeSystem
311+ taskName := alloc .Job .TaskGroups [0 ].Tasks [0 ].Name
312+ alloc .Job .TaskGroups [0 ].Tasks [0 ].Driver = "mock_driver"
313+ alloc .Job .TaskGroups [0 ].Tasks [0 ].Config = map [string ]interface {}{
314+ "run_for" : "1h" ,
315+ }
316+
317+ countTaskDirBuildEvents := func (ar * allocRunner ) int {
318+ state := ar .AllocState ()
319+ ts , ok := state .TaskStates [taskName ]
320+ if ! ok || ts == nil {
321+ return 0
322+ }
323+
324+ count := 0
325+ for _ , event := range ts .Events {
326+ if event .Type == structs .TaskSetup && event .Message == structs .TaskBuildingTaskDir {
327+ count ++
328+ }
329+ }
330+
331+ return count
332+ }
333+
334+ conf , cleanup := testAllocRunnerConfig (t , alloc .Copy ())
335+ defer cleanup ()
336+
337+ // Persist task hook state across runner restarts to simulate data_dir.
338+ conf .StateDB = state .NewMemDB (conf .Logger )
339+
340+ arIface , err := NewAllocRunner (conf )
341+ must .NoError (t , err )
342+ ar := arIface .(* allocRunner )
343+
344+ go ar .Run ()
345+ defer destroy (ar )
346+
347+ testutil .WaitForResult (func () (bool , error ) {
348+ if countTaskDirBuildEvents (ar ) == 0 {
349+ return false , fmt .Errorf ("expected task dir build hook to run on initial run" )
350+ }
351+ return true , nil
352+ }, func (err error ) {
353+ require .NoError (t , err )
354+ })
355+
356+ initialTaskDirBuildEvents := countTaskDirBuildEvents (ar )
357+ require .Greater (t , initialTaskDirBuildEvents , 0 )
358+ allocDirPath := ar .GetAllocDir ().AllocDirPath ()
359+
360+ // Stop the runner and simulate host reboot where client.alloc_dir is wiped.
361+ ar .Shutdown ()
362+ select {
363+ case <- ar .ShutdownCh ():
364+ case <- time .After (30 * time .Second ):
365+ require .Fail (t , "AR took too long to exit" )
366+ }
367+
368+ require .NoError (t , os .RemoveAll (allocDirPath ))
369+
370+ conf2 , cleanup2 := testAllocRunnerConfig (t , alloc .Copy ())
371+ defer cleanup2 ()
372+
373+ // Reuse persistent state and the same alloc storage path after restart.
374+ conf2 .StateDB = conf .StateDB
375+ conf2 .ClientConfig .AllocDir = conf .ClientConfig .AllocDir
376+ conf2 .ClientConfig .AllocMountsDir = conf .ClientConfig .AllocMountsDir
377+
378+ ar2Iface , err := NewAllocRunner (conf2 )
379+ must .NoError (t , err )
380+ ar2 := ar2Iface .(* allocRunner )
381+
382+ must .NoError (t , ar2 .Restore ())
383+
384+ go ar2 .Run ()
385+ defer destroy (ar2 )
386+
387+ testutil .WaitForResult (func () (bool , error ) {
388+ if countTaskDirBuildEvents (ar2 ) <= initialTaskDirBuildEvents {
389+ return false , fmt .Errorf ("expected task dir build hook to rerun on restore run" )
390+ }
391+ return true , nil
392+ }, func (err error ) {
393+ require .NoError (t , err )
394+ })
395+ }
396+
// allocFailingPrestartHook is a hook type that carries no fields.
// NOTE(review): judging by its name it is presumably a prestart hook that
// always fails; its remaining methods are outside this view — confirm.
type allocFailingPrestartHook struct{}

// Name returns the identifier of this hook, "failing_prestart".
func (*allocFailingPrestartHook) Name() string {
	const hookName = "failing_prestart"
	return hookName
}
0 commit comments