Skip to content

Commit 1d8d078

Browse files
authored
Bump version to 0.3.1
2 parents 832852a + 5e287ed commit 1d8d078

14 files changed

Lines changed: 497 additions & 24 deletions

File tree

.gitmodules

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
[submodule "diffusion-policy"]
2-
path = src/diffusion-policy
3-
url = https://github.com/real-stanford/diffusion_policy.git
4-
commit = 5ba07ac6661db573af695b419a7947ecb704690f
51
[submodule "internnav/model/basemodel/LongCLIP"]
62
path = internnav/model/basemodel/LongCLIP
73
url = https://github.com/beichenzbc/Long-CLIP
84
commit = 3966af9ae9331666309a22128468b734db4672a7
95
ignore = untracked
6+
[submodule "third_party/diffusion-policy"]
7+
path = third_party/diffusion-policy
8+
url = https://github.com/real-stanford/diffusion_policy.git
9+
commit = 5ba07ac6661db573af695b419a7947ecb704690f

docs/changelog.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,24 @@ All notable changes to this project will be documented in this file.
66

77
Upcoming changes will be tracked in this section.
88

9+
## Changelog of v0.3.1 (2026/02/09)
10+
### Highlights
11+
- Update dataset conversion for InternData-N1 VLN-PE v0.5 format (#288)
12+
- Support `vis_debug` option for Habitat evaluation (#265)
13+
14+
### New Features
15+
- Support `vis_debug` option for Habitat evaluation (#265)
16+
17+
### Improvements
18+
- Update submodule path from `src` to `third_party` (#266)
19+
20+
### Bug Fixes
21+
- Fix VLN path handling (#228)
22+
- Fix dataset conversion for InternData-N1 VLN-PE v0.5 format (#288)
23+
24+
Full Changelog: https://github.com/InternRobotics/InternNav/compare/release/v0.3.0...release/v0.3.1
25+
26+
927
## Changelog of v0.3.0 (2026/01/05)
1028
### Highlights
1129
- Support training of InternVLA-N1 and evaluation on RxR (#184)

docs/compatibility.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Compatibility
2+
3+
## v0.3.1
4+
5+
### InternData-N1 update to v0.5
6+
7+
The InternData-N1 VLN-PE trajectory training dataset has been upgraded from `v0.1` to `v0.5`. This update introduces minor structural changes in the dataset layout and updates the LeRobot-to-LMDB conversion logic to match the new `v0.5` data structure.
8+
9+
The training pipeline now uses the new key name:
10+
- `instruction_text` → `task`
11+
12+
The updated conversion logic is **not compatible** with InternData-N1 `v0.1`.

internnav/dataset/cma_lerobot_dataset.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from internnav.dataset.base import BaseDataset, ObservationsDict, _block_shuffle
88
from internnav.model.utils.feature_extract import extract_instruction_tokens
9-
from internnav.utils.lerobot_as_lmdb import LerobotAsLmdb
9+
from internnav.utils.loader import LerobotAsLmdb
1010

1111

1212
class CMALerobotDataset(BaseDataset):
@@ -38,8 +38,9 @@ def __init__(
3838
self.camera_name = self.config.il.camera_name
3939

4040
self.lerobot_as_lmdb = LerobotAsLmdb(self.lerobot_features_dir)
41-
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys()
41+
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys(allow_scan_list=['r2r']) # r2r / r2r_aliengo / r2r_flash
4242
self.length = len(self.lmdb_keys)
43+
print(f"total keys in traj_data: {len(self.lmdb_keys)}")
4344

4445
# For CMA-CLIP
4546
self.use_clip_encoders = False
@@ -105,14 +106,12 @@ def _load_next(self):
105106
data['camera_info'][self.camera_name]['rgb'] = data['camera_info'][self.camera_name]['rgb'][
106107
:-drop_last_frame_nums
107108
]
108-
data['camera_info'][self.camera_name]['depth'] = data['camera_info'][self.camera_name][
109-
'depth'
110-
][:-drop_last_frame_nums]
111-
data['robot_info']['yaw'] = data['robot_info']['yaw'][:-drop_last_frame_nums]
112-
data['robot_info']['position'] = data['robot_info']['position'][:-drop_last_frame_nums]
113-
data['robot_info']['orientation'] = data['robot_info']['orientation'][
109+
data['camera_info'][self.camera_name]['depth'] = data['camera_info'][self.camera_name]['depth'][
114110
:-drop_last_frame_nums
115111
]
112+
data['robot_info']['yaw'] = data['robot_info']['yaw'][:-drop_last_frame_nums]
113+
data['robot_info']['position'] = data['robot_info']['position'][:-drop_last_frame_nums]
114+
data['robot_info']['orientation'] = data['robot_info']['orientation'][:-drop_last_frame_nums]
116115
data['progress'] = data['progress'][:-drop_last_frame_nums]
117116
data['step'] = data['step'][:-drop_last_frame_nums]
118117
if 'rgb_features' in data.keys():
@@ -132,13 +131,11 @@ def _load_next(self):
132131

133132
if self.bert_tokenizer is not None:
134133
instructions = [
135-
episodes_in_json[ep_idx]['instruction_text']
136-
for ep_idx in range(len(episodes_in_json))
134+
episodes_in_json[ep_idx]['instruction_text'] for ep_idx in range(len(episodes_in_json))
137135
]
138136
else:
139137
instructions = [
140-
episodes_in_json[ep_idx]['instruction_tokens']
141-
for ep_idx in range(len(episodes_in_json))
138+
episodes_in_json[ep_idx]['instruction_tokens'] for ep_idx in range(len(episodes_in_json))
142139
]
143140
for instruction in instructions:
144141
new_data = self._create_new_data(data, yaws, instruction)

internnav/dataset/rdp_lerobot_dataset.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from internnav.model.basemodel.LongCLIP.model import longclip
2727
from internnav.model.utils.feature_extract import extract_instruction_tokens
2828
from internnav.utils.geometry_utils import get_delta, normalize_data, to_local_coords
29-
from internnav.utils.lerobot_as_lmdb import LerobotAsLmdb
29+
from internnav.utils.loader import LerobotAsLmdb
3030

3131

3232
def _convert_image_to_rgb(image):
@@ -103,8 +103,9 @@ def __init__(
103103
self.to_pil = ToPILImage()
104104
self.image_processor = _transform(n_px=224) # copy from clip-long
105105
self.lerobot_as_lmdb = LerobotAsLmdb(self.lerobot_features_dir)
106-
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys()
106+
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys(allow_scan_list=['r2r']) # r2r / r2r_aliengo / r2r_flash
107107
self.length = len(self.lmdb_keys)
108+
print(f"total keys in traj_data: {len(self.lmdb_keys)}")
108109

109110
self.start = 0
110111
self.end = self.length
@@ -192,7 +193,7 @@ def _load_next(self): # noqa: C901
192193
episodes_in_json = data_to_load['episodes_in_json']
193194

194195
instructions = [
195-
episodes_in_json[ep_idx]['instruction_text'][: self.config.model.text_encoder.max_length]
196+
episodes_in_json[ep_idx]['task'][: self.config.model.text_encoder.max_length]
196197
for ep_idx in range(len(episodes_in_json))
197198
]
198199

internnav/habitat_extensions/vln/habitat_vln_evaluator.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import cv2
1515
import habitat
16+
import imageio
1617
import numpy as np
1718
import quaternion
1819
import torch
@@ -105,6 +106,8 @@ def __init__(self, cfg: EvalCfg):
105106

106107
# ------------------------------------- model ------------------------------------------
107108
self.model_args = argparse.Namespace(**cfg.agent.model_settings)
109+
self.vis_debug = bool(getattr(self.model_args, "vis_debug", False))
110+
self.vis_debug_path = getattr(self.model_args, "vis_debug_path", os.path.join(self.output_path, "vis_debug"))
108111

109112
processor = AutoProcessor.from_pretrained(self.model_args.model_path)
110113
processor.tokenizer.padding_side = 'left'
@@ -256,7 +259,7 @@ def resume_from_output_path(self) -> None:
256259
ndtw.append(res['ndtw'])
257260
return sucs, spls, oss, nes, ndtw
258261

259-
def _run_eval_dual_system(self) -> tuple:
262+
def _run_eval_dual_system(self) -> tuple: # noqa: C901
260263
self.model.eval()
261264

262265
# resume from previous results
@@ -288,9 +291,17 @@ def _run_eval_dual_system(self) -> tuple:
288291

289292
vis_frames = []
290293
step_id = 0
294+
vis_writer = None
291295

292296
if self.save_video:
293297
os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
298+
if self.vis_debug:
299+
debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
300+
os.makedirs(debug_dir, exist_ok=True)
301+
vis_writer = imageio.get_writer(
302+
os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
303+
fps=5,
304+
)
294305

295306
rgb_list = []
296307
action_seq = []
@@ -307,6 +318,7 @@ def _run_eval_dual_system(self) -> tuple:
307318

308319
# ---------- 2. Episode step loop -----------
309320
while (not done) and (step_id <= self.max_steps_per_episode):
321+
draw_pixel_goal = False
310322
# refactor agent get action
311323
rgb = observations["rgb"]
312324
depth = observations["depth"]
@@ -422,6 +434,7 @@ def _run_eval_dual_system(self) -> tuple:
422434
coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
423435

424436
pixel_goal = [int(coord[1]), int(coord[0])]
437+
draw_pixel_goal = True
425438

426439
# look down --> horizontal
427440
self.env.step(action_code.LOOKUP)
@@ -526,6 +539,22 @@ def _run_eval_dual_system(self) -> tuple:
526539

527540
print("step_id", step_id, "action", action)
528541

542+
if vis_writer is not None:
543+
vis = np.asarray(save_raw_image).copy()
544+
vis = cv2.putText(
545+
vis,
546+
f"step {step_id} action {int(action)}",
547+
(20, 40),
548+
cv2.FONT_HERSHEY_SIMPLEX,
549+
1,
550+
(0, 255, 0),
551+
2,
552+
)
553+
if pixel_goal is not None:
554+
if draw_pixel_goal:
555+
cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
556+
vis_writer.append_data(vis)
557+
529558
if action == action_code.LOOKDOWN:
530559
self.env.step(action)
531560
observations, _, done, _ = self.env.step(action)
@@ -586,6 +615,8 @@ def _run_eval_dual_system(self) -> tuple:
586615
quality=9,
587616
)
588617
vis_frames.clear()
618+
if vis_writer is not None:
619+
vis_writer.close()
589620

590621
self.env.close()
591622

@@ -643,9 +674,17 @@ def _run_eval_system2(self) -> tuple:
643674

644675
vis_frames = []
645676
step_id = 0
677+
vis_writer = None
646678

647679
if self.save_video:
648680
os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
681+
if self.vis_debug:
682+
debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
683+
os.makedirs(debug_dir, exist_ok=True)
684+
vis_writer = imageio.get_writer(
685+
os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
686+
fps=5,
687+
)
649688
initial_height = self.env._env.sim.get_agent_state().position[1]
650689

651690
rgb_list = []
@@ -662,6 +701,7 @@ def _run_eval_system2(self) -> tuple:
662701

663702
# ---------- 2. Episode step loop -----------
664703
while (not done) and (step_id <= self.max_steps_per_episode):
704+
draw_pixel_goal = False
665705
# refactor agent get action
666706
rgb = observations["rgb"]
667707
depth = observations["depth"]
@@ -755,6 +795,7 @@ def _run_eval_system2(self) -> tuple:
755795
coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
756796

757797
pixel_goal = [int(coord[1]), int(coord[0])]
798+
draw_pixel_goal = True
758799

759800
# look down --> horizontal
760801
self.env.step(action_code.LOOKUP)
@@ -818,6 +859,21 @@ def _run_eval_system2(self) -> tuple:
818859

819860
print("step_id", step_id, "action", action)
820861

862+
if vis_writer is not None:
863+
vis = np.asarray(save_raw_image).copy()
864+
vis = cv2.putText(
865+
vis,
866+
f"step {step_id} action {int(action)}",
867+
(20, 40),
868+
cv2.FONT_HERSHEY_SIMPLEX,
869+
1,
870+
(0, 255, 0),
871+
2,
872+
)
873+
if draw_pixel_goal:
874+
cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
875+
vis_writer.append_data(vis)
876+
821877
if action == action_code.LOOKDOWN:
822878
self.env.step(action)
823879
observations, _, done, _ = self.env.step(action)
@@ -875,6 +931,8 @@ def _run_eval_system2(self) -> tuple:
875931
quality=9,
876932
)
877933
vis_frames.clear()
934+
if vis_writer is not None:
935+
vis_writer.close()
878936

879937
self.env.close()
880938

0 commit comments

Comments
 (0)