[FIX] Support NavDP finetuning

wzcai99 · wzcai99 · commit 1545c203fe81 · 2025-11-05T03:46:46.000Z
diff --git a/internnav/dataset/navdp_dataset_lerobot.py b/internnav/dataset/navdp_dataset_lerobot.py
@@ -511,6 +511,10 @@ def __getitem__(self, index):
             camera_intrinsic,
             trajectory_base_extrinsic,
         )
+        # pixel channel == 7 represents the navdp works pixel navigation under asynchronous pace,
+        # pixel_mask (1), the history image with the assigned pixel goal (3), current image (3)
+        # if pixel_channel == 4, pixel goal is assigned at current frame, therefore,
+        # only pixel_mask (1) and current image (3) are needed
         if self.pixel_channel == 7:
             pixel_goal = np.concatenate((pixel_goal, memory_images[-1]), axis=-1)
 
diff --git a/internnav/model/basemodel/navdp/navdp_policy.py b/internnav/model/basemodel/navdp/navdp_policy.py
@@ -7,13 +7,15 @@
 
 from internnav.configs.model.base_encoders import ModelCfg
 from internnav.configs.trainer.exp import ExpCfg
+from internnav.model.encoder.navdp_backbone import LearnablePositionalEncoding
 from internnav.model.encoder.navdp_backbone import (
-    LearnablePositionalEncoding,
-    NavDP_ImageGoal_Backbone,
-    NavDP_PixelGoal_Backbone,
-    NavDP_RGBD_Backbone,
-    SinusoidalPosEmb,
+    NavDP_ImageGoal_Backbone as ImageGoal_Backbone,
 )
+from internnav.model.encoder.navdp_backbone import (
+    NavDP_PixelGoal_Backbone as PixelGoal_Backbone,
+)
+from internnav.model.encoder.navdp_backbone import NavDP_RGBD_Backbone as RGBD_Backbone
+from internnav.model.encoder.navdp_backbone import SinusoidalPosEmb
 
 
 class NavDPModelConfig(PretrainedConfig):
@@ -83,13 +85,13 @@ def __init__(self, config: NavDPModelConfig):
         self.token_dim = self.config.model_cfg['il']['token_dim']
         self.scratch = self.config.model_cfg['il']['scratch']
         self.finetune = self.config.model_cfg['il']['finetune']
-        self.rgbd_encoder = NavDP_RGBD_Backbone(
+        self.rgbd_encoder = RGBD_Backbone(
             self.image_size, self.token_dim, memory_size=self.memory_size, finetune=self.finetune, device=self._device
         )
-        self.pixel_encoder = NavDP_PixelGoal_Backbone(
+        self.pixel_encoder = PixelGoal_Backbone(
             self.image_size, self.token_dim, pixel_channel=self.pixel_channel, device=self._device
         )
-        self.image_encoder = NavDP_ImageGoal_Backbone(self.image_size, self.token_dim, device=self._device)
+        self.image_encoder = ImageGoal_Backbone(self.image_size, self.token_dim, device=self._device)
         self.point_encoder = nn.Linear(3, self.token_dim)
 
         if not self.finetune:
@@ -185,23 +187,6 @@ def predict_critic(self, predict_trajectory, rgbd_embed):
         return critic_output
 
     def forward(self, goal_point, goal_image, goal_pixel, input_images, input_depths, output_actions, augment_actions):
-        # """get device safely"""
-        # # get device safely
-        # try:
-        #     # try to get device through model parameters
-        #     device = next(self.parameters()).device
-        # except StopIteration:
-        #     # model has no parameters, use the default device
-        #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        # # move all inputs to model device
-        # goal_point = goal_point.to(device)
-        # goal_image = goal_image.to(device)
-        # input_images = input_images.to(device)
-        # input_depths = input_depths.to(device)
-        # output_actions = output_actions.to(device)
-        # augment_actions = augment_actions.to(device)
-        # device = self._device
-        # print(f"self.parameters() is:{self.parameters()}")
         device = next(self.parameters()).device
 
         assert input_images.shape[1] == self.memory_size
@@ -330,7 +315,6 @@ def predict_pointgoal_batch_action_vel(self, goal_point, input_images, input_dep
                 naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample
 
             critic_values = self.predict_critic(naction, rgbd_embed)
-            # all_trajectory = torch.cumsum(naction / 4.0, dim=1)
 
             negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]]
             positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]]
@@ -349,12 +333,7 @@ def predict_nogoal_batch_action_vel(self, input_images, input_depths, sample_num
                 naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample
 
             critic_values = self.predict_critic(naction, rgbd_embed)
-            # all_trajectory = torch.cumsum(naction / 4.0, dim=1)
 
             negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]]
             positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]]
             return negative_trajectory, positive_trajectory
-
-
-# if __name__ == "__main__":
-#     policy = NavDPNet(config=)
diff --git a/scripts/train/train.py b/scripts/train/train.py
@@ -103,7 +103,6 @@ def main(config, model_class, model_config_class):
             # Initialize distributed training environment
             if world_size > 1:
                 try:
-                    dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
                     dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
                     print("Distributed initialization SUCCESS")
                 except Exception as e: