Commit b341212

gagika authored and Google-ML-Automation committed
[tunix/sft] Avoid eager re-sharding of globally distributed arrays
Updates `PeftTrainer` to supply `data_sharding_axis` explicitly in `train_distill.py` to match MaxText's native sharding axis. Additionally adds a check in `sharding_utils.shard_input` that skips re-sharding of fully global, non-addressable arrays, preventing TPU memory addressability errors.

PiperOrigin-RevId: 902282275
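The guard described above can be sketched as follows. This is a minimal, hypothetical illustration (the function name `shard_input` matches the commit message, but its body here is an assumption, not the actual Tunix implementation): in multi-host JAX, a globally sharded array whose shards live on other processes is not fully addressable locally, and calling `jax.device_put` on it to re-shard would fail, so such arrays are returned unchanged.

```python
import jax


def shard_input(x, sharding):
    """Place `x` under `sharding`, skipping fully global non-addressable arrays.

    Sketch only: if `x` is a jax.Array whose shards are not all addressable
    from this process (a globally distributed array in multi-host training),
    re-sharding it eagerly would touch memory this host cannot address, so
    we leave it as-is and let downstream computation handle it.
    """
    if isinstance(x, jax.Array) and not x.is_fully_addressable:
        return x  # already globally sharded; avoid eager re-sharding
    return jax.device_put(x, sharding)
```

On a single host every array is fully addressable, so the function simply performs the `device_put`; the early return only triggers in multi-process runs.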
1 parent 1907615 commit b341212

1 file changed

Lines changed: 2 additions & 0 deletions

File tree

tests/post_training/unit/train_distill_test.py

```diff
@@ -1027,6 +1027,7 @@ def test_main_offline_mode_skips_teacher_loading(
     mock_student_cfg.eval_interval = -1
     mock_student_cfg.gradient_accumulation_steps = 1
     mock_student_cfg.global_batch_size = 8
+    mock_student_cfg.data_sharding = ("fsdp",)

     # Add dummy numbers for strategy math/logic
     mock_student_cfg.distill_temperature = 1.0
@@ -1116,6 +1117,7 @@ def test_main_online_mode_loads_teacher(
     mock_student_cfg.eval_interval = -1
     mock_student_cfg.gradient_accumulation_steps = 1
     mock_student_cfg.global_batch_size = 8
+    mock_student_cfg.data_sharding = ("fsdp",)

     # Add dummy numbers for strategy math/logic
     mock_student_cfg.distill_temperature = 1.0
```
