@@ -96,6 +96,29 @@ def small_training_mamba_finetune_cmd(
9696 return cmd
9797
9898
def small_training_llama_cmd(path, max_steps: int, val_check: int, devices: int = 1, additional_args: str = "") -> str:
    """Build the `train_evo2` command line for a tiny mock-data Llama training run.

    Args:
        path: Directory passed to `--result-dir`.
        max_steps: Value passed to `--max-steps`.
        val_check: Value passed to `--val-check-interval`.
        devices: Value passed to `--devices`.
        additional_args: Extra CLI text appended verbatim after the fixed flags.

    Returns:
        The full command as a single string (it ends with a trailing space).
    """
    # Every flag is emitted exactly once; repeating `--limit-val-batches 1` adds nothing.
    cmd = (
        f"train_evo2 --no-fp32-residual-connection --mock-data --result-dir {path} --devices {devices} "
        "--model-size 8B --num-layers 2 --limit-val-batches 1 "
        "--no-activation-checkpointing --create-tensorboard-logger --create-tflops-callback "
        f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} "
        f"--seq-length 8 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args} "
    )
    return cmd
108+
109+
def small_training_llama_finetune_cmd(
    path, max_steps: int, val_check: int, prev_ckpt, devices: int = 1, additional_args: str = ""
) -> str:
    """Build the `train_evo2` command line for a tiny Llama run that starts from a checkpoint.

    Args:
        path: Directory passed to `--result-dir`.
        max_steps: Value passed to `--max-steps`.
        val_check: Value passed to `--val-check-interval`.
        prev_ckpt: Checkpoint location passed to `--ckpt-dir`.
        devices: Value passed to `--devices`.
        additional_args: Extra CLI text inserted verbatim before `--ckpt-dir`.

    Returns:
        The full command as a single string (it ends with a trailing space).
    """
    # Every flag is emitted exactly once; repeating `--limit-val-batches 1` adds nothing.
    cmd = (
        f"train_evo2 --no-fp32-residual-connection --mock-data --result-dir {path} --devices {devices} "
        "--model-size 8B --num-layers 2 --limit-val-batches 1 "
        "--no-activation-checkpointing --create-tensorboard-logger --create-tflops-callback "
        f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} "
        f"--seq-length 16 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args} --ckpt-dir {prev_ckpt} "
    )
    return cmd
121+
99122@pytest .mark .timeout (512 ) # Optional: fail if the test takes too long.
100123@pytest .mark .slow
101124def test_train_evo2_finetune_runs (tmp_path ):
@@ -243,6 +266,80 @@ def test_train_evo2_mamba_finetune_runs(tmp_path):
243266 assert len (matching_subfolders_ft ) == 1 , "Only one checkpoint subfolder should be found."
244267
245268
269+
def _assert_llama_run_artifacts(result_dir, num_steps):
    """Assert that a `train_evo2` run wrote its expected outputs under *result_dir*.

    Checks, in order: the `evo2` log folder, its `checkpoints` folder, a checkpoint
    subfolder whose name contains `"<num_steps>.0-last"`, the `dev` TensorBoard folder,
    at least one TensorBoard event file, and that exactly one checkpoint matched.

    Returns:
        The single matching checkpoint subfolder.
    """
    log_dir = result_dir / "evo2"
    checkpoints_dir = log_dir / "checkpoints"
    tensorboard_dir = log_dir / "dev"

    # Check if logs dir exists
    assert log_dir.exists(), "Logs folder should exist."
    # Check if checkpoints dir exists
    assert checkpoints_dir.exists(), "Checkpoints folder does not exist."

    # No whitespace may appear between the step count and ".0-last", or nothing matches.
    expected_checkpoint_suffix = f"{num_steps}.0-last"
    matching_subfolders = [
        p for p in checkpoints_dir.iterdir() if p.is_dir() and (expected_checkpoint_suffix in p.name)
    ]
    assert matching_subfolders, (
        f"No checkpoint subfolder ending with '{expected_checkpoint_suffix}' found in {checkpoints_dir}."
    )

    # Check if directory with tensorboard logs exists
    assert tensorboard_dir.exists(), "TensorBoard logs folder does not exist."
    # Recursively search for files with tensorboard logger
    event_files = list(tensorboard_dir.rglob("events.out.tfevents*"))
    assert event_files, f"No TensorBoard event files found under {tensorboard_dir}"

    assert len(matching_subfolders) == 1, "Only one checkpoint subfolder should be found."
    return matching_subfolders[0]


@pytest.mark.timeout(512)  # Optional: fail if the test takes too long.
@pytest.mark.slow
def test_train_evo2_llama_finetune_runs(tmp_path):
    """Run a mock-data Llama pretraining with `train_evo2`, then fine-tune from its checkpoint.

    Both commands run in a subprocess with the pytest-provided temporary directory as the
    working directory. After each run the log, checkpoint and TensorBoard outputs are
    verified. The pretraining output must not mention restoring weights, while the
    fine-tuning output must.
    """
    num_steps = 2
    restore_marker = "Restoring model weights from RestoreConfig(path='"

    # Note: The command assumes that `train_evo2` is in your PATH.
    command = small_training_llama_cmd(tmp_path / "pretrain", max_steps=num_steps, val_check=num_steps)
    stdout_pretrain: str = run_command_in_subprocess(command=command, path=str(tmp_path))
    assert restore_marker not in stdout_pretrain

    pretrain_ckpt = _assert_llama_run_artifacts(tmp_path / "pretrain", num_steps)

    # Fine-tune starting from the single checkpoint produced by the pretraining run.
    command_finetune = small_training_llama_finetune_cmd(
        tmp_path / "finetune", max_steps=num_steps, val_check=num_steps, prev_ckpt=pretrain_ckpt
    )
    stdout_finetune: str = run_command_in_subprocess(command=command_finetune, path=str(tmp_path))
    assert restore_marker in stdout_finetune

    _assert_llama_run_artifacts(tmp_path / "finetune", num_steps)
341+
342+
246343@pytest .mark .timeout (256 ) # Optional: fail if the test takes too long.
247344@pytest .mark .slow
248345def test_train_evo2_stops (tmp_path ):
0 commit comments