fix: Added per rank log file for ODM (#168)

romitjain · web-flow · commit 8c990a12898d · 2026-01-30T14:34:37.000+05:30
* Added per rank log file for ODM

Signed-off-by: romit <romit@ibm.com>

* Pinned transformers version

Signed-off-by: romit <romit@ibm.com>

* Pinned transformers in framework package

* Fixed CI/CD for other packages

Signed-off-by: romit <romit@ibm.com>

---------

Signed-off-by: romit <romit@ibm.com>
diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py
@@ -113,7 +113,7 @@ def save_fsdp_optimizer(
         )
     sd_options = _prepare_sd_options(fsdp_plugin)
     # get the state dicts for model and optimize
-    (model_state_dict, optimizer_state_dict) = get_state_dict(
+    model_state_dict, optimizer_state_dict = get_state_dict(
         model, optimizer, options=sd_options
     )
 
diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe.py b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe.py
@@ -389,7 +389,7 @@ def _maybe_scatter(
 
         # expect these products to be produced by an earlier
         # all-to-all gather call
-        (send_counts, recv_counts, bins, sorted_expert_idxs, sorted_scattered_idxs) = (
+        send_counts, recv_counts, bins, sorted_expert_idxs, sorted_scattered_idxs = (
             gather_products
         )
 
@@ -421,7 +421,7 @@ def forward(self, hidden_states: torch.Tensor):
         # compute the routing logits, weights, and expert assigments
         # - router_logits: will be passed out of forward, used for computing
         #   routing loss.
-        (router_logits, routing_weights, selected_experts) = (
+        router_logits, routing_weights, selected_experts = (
             self._compute_routing_weights(hidden_states)
         )
 
diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_state_dict.py b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_state_dict.py
@@ -188,7 +188,7 @@ def _maybe_reshape_scattermoe_expert_weights(
     num_experts: int,
     intermediate_size: int,
 ):
-    (_is_w1, _is_w2, _is_w3) = [
+    _is_w1, _is_w2, _is_w3 = [
         f"{x}.weight" in scatter_key for x in PARAM_NAME_WEIGHT_SCATTERMOE
     ]
 
diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py
@@ -163,10 +163,8 @@ def get_gptq_peft_model(
                     model.model, model_id, adapter_name
                 )
         except Exception as exc:
-            raise NotImplementedError(
-                f"{model.__class__.__name__} not support \
-                    {peft_config.peft_type.value} peft type yet."
-            ) from exc
+            raise NotImplementedError(f"{model.__class__.__name__} not support \
+                    {peft_config.peft_type.value} peft type yet.") from exc
 
     return peft_model
 
diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py
@@ -297,7 +297,5 @@ def test_quantizing_pretrained_model_outputs_match(
     target = torch.nn.functional.softmax(original_logits, dim=-1)
     target = target.view(BS * SEQLEN, -1)
     error = loss_fn(input, target)
-    assert error.lt(
-        LOSS_TOLERANCE
-    ), "Model logits don't match between both libraries \
+    assert error.lt(LOSS_TOLERANCE), "Model logits don't match between both libraries \
         after quantization"
diff --git a/plugins/framework/pyproject.toml b/plugins/framework/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
   "peft>=0.15.0",
   "accelerate @ git+https://github.com/huggingface/accelerate.git@5998f8625b8dfde9253c241233ff13bc2c18635d",
   "pandas",
+  "transformers>=4.55.0,<=4.55.4",
 ]
 
 [tool.hatch.build.targets.wheel]
diff --git a/plugins/online-data-mixing/pyproject.toml b/plugins/online-data-mixing/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
   "datasets==4.*",
   "torchdata==0.11.0",
   "sentence-transformers==5.*",
+  "transformers>=4.55.0,<=4.55.4",
 ]
 
 [project.optional-dependencies]
diff --git a/plugins/online-data-mixing/src/fms_acceleration_odm/odm/dataset.py b/plugins/online-data-mixing/src/fms_acceleration_odm/odm/dataset.py
@@ -142,6 +142,7 @@ def __init__(
         self.id2cat = dict(enumerate(self.category_list))
         self.cat2id = {c: i for i, c in enumerate(self.category_list)}
         self.total_categories = len(self.category_list)
+        self.rank = os.environ.get("RANK", "0")
 
         # If not starting weights given, then all arms (categories)
         # are equally important. Weights based on the size of the datasets
@@ -174,7 +175,7 @@ def __init__(
         self.output_dir = output_dir
         if not os.path.exists(self.output_dir):
             os.makedirs(self.output_dir)
-        self.log_file_path = os.path.join(self.output_dir, "odm.jsonl")
+        self.log_file_path = os.path.join(self.output_dir, f"odm_rank_{self.rank}.jsonl")
         logger.info(
             "Logs for online data mixing to be stored at {log_file_path}".format(
                 log_file_path=self.log_file_path
@@ -191,6 +192,7 @@ def __init__(
             "rewards": [0] * self.total_categories,
             "count": 0,
             "action": "",  # one of sample or update
+            "rank": self.rank,
         }
 
         # Local RNG so every process can deterministically sample identical streams.
@@ -274,6 +276,7 @@ def __next__(self):
                 "action": "sample",
             }
         )
+
         return sample
 
     def load_state_dict(self, state_dict):
@@ -548,13 +551,12 @@ def update_sampling_weights(self, model, accelerator, state):
             count = accelerator.reduce(count, reduction="sum")
 
         self._update_weights(count, rewards)
-        if accelerator and accelerator.is_main_process:
-            self.log_to_file(
-                {
-                    "current_sampling_weights": self.sampling_weights.tolist(),
-                    "current_sampling_ratio": self.sampling_ratio,
-                    "rewards": rewards.tolist(),
-                    "count": count.tolist(),
-                    "action": "update",
-                }
-            )
+        self.log_to_file(
+            {
+                "current_sampling_weights": self.sampling_weights.tolist(),
+                "current_sampling_ratio": self.sampling_ratio,
+                "rewards": rewards.tolist(),
+                "count": count.tolist(),
+                "action": "update",
+            }
+        )

Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,7 @@ def save_fsdp_optimizer(`
`113`	`113`	`)`
`114`	`114`	`sd_options = _prepare_sd_options(fsdp_plugin)`
`115`	`115`	`# get the state dicts for model and optimize`
`116`		`- (model_state_dict, optimizer_state_dict) = get_state_dict(`
	`116`	`+ model_state_dict, optimizer_state_dict = get_state_dict(`
`117`	`117`	`model, optimizer, options=sd_options`
`118`	`118`	`)`
`119`	`119`
Original file line number	Diff line number	Diff line change
`@@ -389,7 +389,7 @@ def _maybe_scatter(`
`389`	`389`
`390`	`390`	`# expect these products to be produced by an earlier`
`391`	`391`	`# all-to-all gather call`
`392`		`- (send_counts, recv_counts, bins, sorted_expert_idxs, sorted_scattered_idxs) = (`
	`392`	`+ send_counts, recv_counts, bins, sorted_expert_idxs, sorted_scattered_idxs = (`
`393`	`393`	`gather_products`
`394`	`394`	`)`
`395`	`395`
`@@ -421,7 +421,7 @@ def forward(self, hidden_states: torch.Tensor):`
`421`	`421`	`# compute the routing logits, weights, and expert assigments`
`422`	`422`	`# - router_logits: will be passed out of forward, used for computing`
`423`	`423`	`# routing loss.`
`424`		`- (router_logits, routing_weights, selected_experts) = (`
	`424`	`+ router_logits, routing_weights, selected_experts = (`
`425`	`425`	`self._compute_routing_weights(hidden_states)`
`426`	`426`	`)`
`427`	`427`
Original file line number	Diff line number	Diff line change
`@@ -188,7 +188,7 @@ def _maybe_reshape_scattermoe_expert_weights(`
`188`	`188`	`num_experts: int,`
`189`	`189`	`intermediate_size: int,`
`190`	`190`	`):`
`191`		`- (_is_w1, _is_w2, _is_w3) = [`
	`191`	`+ _is_w1, _is_w2, _is_w3 = [`
`192`	`192`	`f"{x}.weight" in scatter_key for x in PARAM_NAME_WEIGHT_SCATTERMOE`
`193`	`193`	`]`
`194`	`194`
Original file line number	Diff line number	Diff line change
`@@ -27,6 +27,7 @@ dependencies = [`
`27`	`27`	`"peft>=0.15.0",`
`28`	`28`	`"accelerate @ git+https://github.com/huggingface/accelerate.git@5998f8625b8dfde9253c241233ff13bc2c18635d",`
`29`	`29`	`"pandas",`
	`30`	`+ "transformers>=4.55.0,<=4.55.4",`
`30`	`31`	`]`
`31`	`32`
`32`	`33`	`[tool.hatch.build.targets.wheel]`
Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ dependencies = [`
`30`	`30`	`"datasets==4.*",`
`31`	`31`	`"torchdata==0.11.0",`
`32`	`32`	`"sentence-transformers==5.*",`
	`33`	`+ "transformers>=4.55.0,<=4.55.4",`
`33`	`34`	`]`
`34`	`35`
`35`	`36`	`[project.optional-dependencies]`