fix: clarify comments and FATAL log messages; use GlobalRank() for collective group rank lookup

Chamberlain0w0 · Chamberlain0w0 · commit 477a198da2ba · 2026-05-14T01:22:01.000Z
diff --git a/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc b/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc
@@ -79,12 +79,9 @@ void DistributedOptimizer::BuildShardParamsAndBindGrads() {
                                                            std::vector<int64_t>{static_cast<int64_t>(piece_numel)});
 
                 param_piece->set_grad(grad_piece);
-                // if (use_grad_shard) {
-                //     // NOTE(zbl): Under ZeRO-2, param->grad() is the shard of grad, not the full grad.
-                //     //            The binding is done in the construnctor of DistributedOptimizer.
-                //     //            Not until backward is finished, the value of param->grad() will be updated.
-                //     param->set_grad(grad_piece);
-                // }
+                // NOTE(zbl): Do not call `param->set_grad(grad_piece);` under ZeRO-2.
+                //            The base optimizer only updates the param_piece views; after such a call, param->grad()
+                //            would be a partial flattened shard that does not represent the full parameter grad.
                 shard_params_.push_back(param_piece);
             }
         }
@@ -135,7 +132,7 @@ void DistributedOptimizer::Step() {
 
     // 3. Gather updated param shards back to full params
     StartParamSync(/*force_sync=*/false);
-    // FIXME(zbl): Call sync before param is actually used in next step
+    // TODO(zbl): Delay sync call until param is actually used in next step
     FinishParamSync(/*skip_next_bucket_dispatch=*/true);
 }
 
diff --git a/infini_train/src/nn/parallel/ddp/param_and_grad_buffer.cc b/infini_train/src/nn/parallel/ddp/param_and_grad_buffer.cc
@@ -86,7 +86,7 @@ void ParamAndGradBucket::ScaleGradients(float scaling_factor) {
 
     // FIXME(zbl): should perform in-place multiply
     // grad_data_ *= scaling_factor;
-    LOG(FATAL) << "ParamAndGradBucket: Should not arrive here";
+    LOG(FATAL) << "ParamAndGradBuffer::ScaleGradients(): Inplace multiply not implemented yet.";
 }
 
 ParamAndGradBucketGroup::ParamAndGradBucketGroup(const std::vector<std::shared_ptr<ParamAndGradBucket>> &buckets,
@@ -107,8 +107,7 @@ ParamAndGradBucketGroup::ParamAndGradBucketGroup(const std::vector<std::shared_p
     }
     if (rank_in_collective_pg_ == -1) {
         auto param = *params_.begin();
-        // FIXME(zbl): get correct rank in multi-node settings
-        rank_in_collective_pg_ = collective_pg_->GetGroupRank(param->GetDevice().Rank().thread_rank());
+        rank_in_collective_pg_ = collective_pg_->GetGroupRank(param->GetDevice().Rank().GlobalRank());
     }
 
     param_buffer_shard_list_.resize(buckets_.size());
@@ -168,9 +167,7 @@ void ParamAndGradBucketGroup::RegisterGradReady(const std::shared_ptr<Tensor> &p
         // TODO(zbl): check this if sync is only done in last mircobatch
         // if (!inserted) {
         //     LOG(FATAL) << "ParamAndGradBucketGroup: RegisterGradReady() was called twice for the same parameter in a
-        //     "
-        //                   "bucket group.";
-        //     return;
+        //     bucket group."; return;
         // }
 
         if (params_with_grad_.size() == params_.size()) {
@@ -304,7 +301,7 @@ void ParamAndGradBucketGroup::StartGradSync() {
     }
 
     grad_reduce_dispatched_ = true;
-    // FIXME(zbl): no need to clear params_with_grad_ here if grad sync is only done on last microbatch
+    // TODO(zbl): no need to clear params_with_grad_ here if grad sync is only done on last microbatch
     params_with_grad_.clear();
 }
 
@@ -637,7 +634,7 @@ void ParamAndGradBuffer::ScaleGradients(float scaling_factor) {
 
     // FIXME(zbl): should perform in-place multiply
     // grad_data_ *= scaling_factor;
-    LOG(FATAL) << "Should not arrive here";
+    LOG(FATAL) << "ParamAndGradBuffer::ScaleGradients(): Inplace multiply not implemented yet.";
 }
 
 void ParamAndGradBuffer::Reset(bool need_rebind) {