Skip to content

Commit 1d8d078

Browse files
authored
Bump version to 0.3.1
2 parents 832852a + 5e287ed commit 1d8d078

14 files changed

Lines changed: 497 additions & 24 deletions

File tree

.gitmodules

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
[submodule "diffusion-policy"]
2-
path = src/diffusion-policy
3-
url = https://github.com/real-stanford/diffusion_policy.git
4-
commit = 5ba07ac6661db573af695b419a7947ecb704690f
51
[submodule "internnav/model/basemodel/LongCLIP"]
62
path = internnav/model/basemodel/LongCLIP
73
url = https://github.com/beichenzbc/Long-CLIP
84
commit = 3966af9ae9331666309a22128468b734db4672a7
95
ignore = untracked
6+
[submodule "third_party/diffusion-policy"]
7+
path = third_party/diffusion-policy
8+
url = https://github.com/real-stanford/diffusion_policy.git
9+
commit = 5ba07ac6661db573af695b419a7947ecb704690f

docs/changelog.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,24 @@ All notable changes to this project will be documented in this file.
66

77
Upcoming changes will be tracked in this section.
88

9+
## Changelog of v0.3.1 (2026/02/09)
10+
### Highlights
11+
- Update dataset conversion for InternData-N1 VLN-PE v0.5 format (#288)
12+
- Support `vis_debug` option for Habitat evaluation (#265)
13+
14+
### New Features
15+
- Support `vis_debug` option for Habitat evaluation (#265)
16+
17+
### Improvements
18+
- Update submodule path from `src` to `third_party` (#266)
19+
20+
### Bug Fixes
21+
- Fix VLN path handling (#228)
22+
- Fix dataset conversion for InternData-N1 VLN-PE v0.5 format (#288)
23+
24+
Full Changelog: https://github.com/InternRobotics/InternNav/compare/release/v0.3.0...release/v0.3.1
25+
26+
927
## Changelog of v0.3.0 (2026/01/05)
1028
### Highlights
1129
- Support training of InternVLA-N1 and evaluation on RxR (#184)

docs/compatibility.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Compatibility
2+
3+
## v0.3.1
4+
5+
### InternData-N1 update to v0.5
6+
7+
The InternData-N1 VLN-PE trajectory training dataset has been upgraded from `v0.1` to `v0.5`. This update introduces minor structural changes in the dataset layout and updates the LeRobot-to-LMDB conversion logic to match the new `v0.5` data structure.
8+
9+
The training pipeline now uses the new key name:
10+
- `instruction_text` → `task`
11+
12+
The updated conversion logic is **not compatible** with InternData-N1 `v0.1`.

internnav/dataset/cma_lerobot_dataset.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from internnav.dataset.base import BaseDataset, ObservationsDict, _block_shuffle
88
from internnav.model.utils.feature_extract import extract_instruction_tokens
9-
from internnav.utils.lerobot_as_lmdb import LerobotAsLmdb
9+
from internnav.utils.loader import LerobotAsLmdb
1010

1111

1212
class CMALerobotDataset(BaseDataset):
@@ -38,8 +38,9 @@ def __init__(
3838
self.camera_name = self.config.il.camera_name
3939

4040
self.lerobot_as_lmdb = LerobotAsLmdb(self.lerobot_features_dir)
41-
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys()
41+
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys(allow_scan_list=['r2r']) # r2r / r2r_aliengo / r2r_flash
4242
self.length = len(self.lmdb_keys)
43+
print(f"total keys in traj_data: {len(self.lmdb_keys)}")
4344

4445
# For CMA-CLIP
4546
self.use_clip_encoders = False
@@ -105,14 +106,12 @@ def _load_next(self):
105106
data['camera_info'][self.camera_name]['rgb'] = data['camera_info'][self.camera_name]['rgb'][
106107
:-drop_last_frame_nums
107108
]
108-
data['camera_info'][self.camera_name]['depth'] = data['camera_info'][self.camera_name][
109-
'depth'
110-
][:-drop_last_frame_nums]
111-
data['robot_info']['yaw'] = data['robot_info']['yaw'][:-drop_last_frame_nums]
112-
data['robot_info']['position'] = data['robot_info']['position'][:-drop_last_frame_nums]
113-
data['robot_info']['orientation'] = data['robot_info']['orientation'][
109+
data['camera_info'][self.camera_name]['depth'] = data['camera_info'][self.camera_name]['depth'][
114110
:-drop_last_frame_nums
115111
]
112+
data['robot_info']['yaw'] = data['robot_info']['yaw'][:-drop_last_frame_nums]
113+
data['robot_info']['position'] = data['robot_info']['position'][:-drop_last_frame_nums]
114+
data['robot_info']['orientation'] = data['robot_info']['orientation'][:-drop_last_frame_nums]
116115
data['progress'] = data['progress'][:-drop_last_frame_nums]
117116
data['step'] = data['step'][:-drop_last_frame_nums]
118117
if 'rgb_features' in data.keys():
@@ -132,13 +131,11 @@ def _load_next(self):
132131

133132
if self.bert_tokenizer is not None:
134133
instructions = [
135-
episodes_in_json[ep_idx]['instruction_text']
136-
for ep_idx in range(len(episodes_in_json))
134+
episodes_in_json[ep_idx]['instruction_text'] for ep_idx in range(len(episodes_in_json))
137135
]
138136
else:
139137
instructions = [
140-
episodes_in_json[ep_idx]['instruction_tokens']
141-
for ep_idx in range(len(episodes_in_json))
138+
episodes_in_json[ep_idx]['instruction_tokens'] for ep_idx in range(len(episodes_in_json))
142139
]
143140
for instruction in instructions:
144141
new_data = self._create_new_data(data, yaws, instruction)

internnav/dataset/rdp_lerobot_dataset.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from internnav.model.basemodel.LongCLIP.model import longclip
2727
from internnav.model.utils.feature_extract import extract_instruction_tokens
2828
from internnav.utils.geometry_utils import get_delta, normalize_data, to_local_coords
29-
from internnav.utils.lerobot_as_lmdb import LerobotAsLmdb
29+
from internnav.utils.loader import LerobotAsLmdb
3030

3131

3232
def _convert_image_to_rgb(image):
@@ -103,8 +103,9 @@ def __init__(
103103
self.to_pil = ToPILImage()
104104
self.image_processor = _transform(n_px=224) # copy from clip-long
105105
self.lerobot_as_lmdb = LerobotAsLmdb(self.lerobot_features_dir)
106-
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys()
106+
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys(allow_scan_list=['r2r']) # r2r / r2r_aliengo / r2r_flash
107107
self.length = len(self.lmdb_keys)
108+
print(f"total keys in traj_data: {len(self.lmdb_keys)}")
108109

109110
self.start = 0
110111
self.end = self.length
@@ -192,7 +193,7 @@ def _load_next(self): # noqa: C901
192193
episodes_in_json = data_to_load['episodes_in_json']
193194

194195
instructions = [
195-
episodes_in_json[ep_idx]['instruction_text'][: self.config.model.text_encoder.max_length]
196+
episodes_in_json[ep_idx]['task'][: self.config.model.text_encoder.max_length]
196197
for ep_idx in range(len(episodes_in_json))
197198
]
198199

internnav/habitat_extensions/vln/habitat_vln_evaluator.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import cv2
1515
import habitat
16+
import imageio
1617
import numpy as np
1718
import quaternion
1819
import torch
@@ -105,6 +106,8 @@ def __init__(self, cfg: EvalCfg):
105106

106107
# ------------------------------------- model ------------------------------------------
107108
self.model_args = argparse.Namespace(**cfg.agent.model_settings)
109+
self.vis_debug = bool(getattr(self.model_args, "vis_debug", False))
110+
self.vis_debug_path = getattr(self.model_args, "vis_debug_path", os.path.join(self.output_path, "vis_debug"))
108111

109112
processor = AutoProcessor.from_pretrained(self.model_args.model_path)
110113
processor.tokenizer.padding_side = 'left'
@@ -256,7 +259,7 @@ def resume_from_output_path(self) -> None:
256259
ndtw.append(res['ndtw'])
257260
return sucs, spls, oss, nes, ndtw
258261

259-
def _run_eval_dual_system(self) -> tuple:
262+
def _run_eval_dual_system(self) -> tuple: # noqa: C901
260263
self.model.eval()
261264

262265
# resume from previous results
@@ -288,9 +291,17 @@ def _run_eval_dual_system(self) -> tuple:
288291

289292
vis_frames = []
290293
step_id = 0
294+
vis_writer = None
291295

292296
if self.save_video:
293297
os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
298+
if self.vis_debug:
299+
debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
300+
os.makedirs(debug_dir, exist_ok=True)
301+
vis_writer = imageio.get_writer(
302+
os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
303+
fps=5,
304+
)
294305

295306
rgb_list = []
296307
action_seq = []
@@ -307,6 +318,7 @@ def _run_eval_dual_system(self) -> tuple:
307318

308319
# ---------- 2. Episode step loop -----------
309320
while (not done) and (step_id <= self.max_steps_per_episode):
321+
draw_pixel_goal = False
310322
# refactor agent get action
311323
rgb = observations["rgb"]
312324
depth = observations["depth"]
@@ -422,6 +434,7 @@ def _run_eval_dual_system(self) -> tuple:
422434
coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
423435

424436
pixel_goal = [int(coord[1]), int(coord[0])]
437+
draw_pixel_goal = True
425438

426439
# look down --> horizontal
427440
self.env.step(action_code.LOOKUP)
@@ -526,6 +539,22 @@ def _run_eval_dual_system(self) -> tuple:
526539

527540
print("step_id", step_id, "action", action)
528541

542+
if vis_writer is not None:
543+
vis = np.asarray(save_raw_image).copy()
544+
vis = cv2.putText(
545+
vis,
546+
f"step {step_id} action {int(action)}",
547+
(20, 40),
548+
cv2.FONT_HERSHEY_SIMPLEX,
549+
1,
550+
(0, 255, 0),
551+
2,
552+
)
553+
if pixel_goal is not None:
554+
if draw_pixel_goal:
555+
cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
556+
vis_writer.append_data(vis)
557+
529558
if action == action_code.LOOKDOWN:
530559
self.env.step(action)
531560
observations, _, done, _ = self.env.step(action)
@@ -586,6 +615,8 @@ def _run_eval_dual_system(self) -> tuple:
586615
quality=9,
587616
)
588617
vis_frames.clear()
618+
if vis_writer is not None:
619+
vis_writer.close()
589620

590621
self.env.close()
591622

@@ -643,9 +674,17 @@ def _run_eval_system2(self) -> tuple:
643674

644675
vis_frames = []
645676
step_id = 0
677+
vis_writer = None
646678

647679
if self.save_video:
648680
os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
681+
if self.vis_debug:
682+
debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
683+
os.makedirs(debug_dir, exist_ok=True)
684+
vis_writer = imageio.get_writer(
685+
os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
686+
fps=5,
687+
)
649688
initial_height = self.env._env.sim.get_agent_state().position[1]
650689

651690
rgb_list = []
@@ -662,6 +701,7 @@ def _run_eval_system2(self) -> tuple:
662701

663702
# ---------- 2. Episode step loop -----------
664703
while (not done) and (step_id <= self.max_steps_per_episode):
704+
draw_pixel_goal = False
665705
# refactor agent get action
666706
rgb = observations["rgb"]
667707
depth = observations["depth"]
@@ -755,6 +795,7 @@ def _run_eval_system2(self) -> tuple:
755795
coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
756796

757797
pixel_goal = [int(coord[1]), int(coord[0])]
798+
draw_pixel_goal = True
758799

759800
# look down --> horizontal
760801
self.env.step(action_code.LOOKUP)
@@ -818,6 +859,21 @@ def _run_eval_system2(self) -> tuple:
818859

819860
print("step_id", step_id, "action", action)
820861

862+
if vis_writer is not None:
863+
vis = np.asarray(save_raw_image).copy()
864+
vis = cv2.putText(
865+
vis,
866+
f"step {step_id} action {int(action)}",
867+
(20, 40),
868+
cv2.FONT_HERSHEY_SIMPLEX,
869+
1,
870+
(0, 255, 0),
871+
2,
872+
)
873+
if draw_pixel_goal:
874+
cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
875+
vis_writer.append_data(vis)
876+
821877
if action == action_code.LOOKDOWN:
822878
self.env.step(action)
823879
observations, _, done, _ = self.env.step(action)
@@ -875,6 +931,8 @@ def _run_eval_system2(self) -> tuple:
875931
quality=9,
876932
)
877933
vis_frames.clear()
934+
if vis_writer is not None:
935+
vis_writer.close()
878936

879937
self.env.close()
880938

0 commit comments

Comments
 (0)