1616from internnav .configs .agent import AgentCfg
1717
1818try :
19+ from depth_camera_filtering import filter_depth
20+ from habitat .tasks .nav .shortest_path_follower import ShortestPathFollower
1921 from transformers import (
2022 AutoProcessor ,
2123 AutoTokenizer ,
2224 Qwen2_5_VLForConditionalGeneration ,
2325 )
24- from depth_camera_filtering import filter_depth
25- from habitat .tasks .nav .shortest_path_follower import ShortestPathFollower
2626except Exception as e :
2727 print (f"Warning: ({ e } ), Habitat Evaluation is not loaded in this runtime. Ignore this if not using Habitat." )
2828
@@ -47,12 +47,12 @@ def split_and_clean(text):
4747@Agent .register ('dialog' )
4848class DialogAgent (Agent ):
4949 """Vision-language navigation agent that can either move or ask an oracle via dialog. The agent builds a multimodal
50- chat prompt from current/historical RGB observations (and optional look-down frames), runs a Qwen2.5-VL model to
51- produce either an action sequence, a pixel waypoint, or a dialog query, then converts the model output into
50+ chat prompt from current/historical RGB observations (and optional look-down frames), runs a Qwen2.5-VL model to
51+ produce either an action sequence, a pixel waypoint, or a dialog query, then converts the model output into
5252 simulator actions and (optionally) a world-space navigation goal.
5353
5454 Args:
55- agent_config (AgentCfg): AgentCfg containing model_settings (e.g., task name, sensor config, model path, mode,
55+ agent_config (AgentCfg): AgentCfg containing model_settings (e.g., task name, sensor config, model path, mode,
5656 resizing, dialog flags, and generation parameters) and runtime info such as local_rank.
5757 """
5858
@@ -440,7 +440,7 @@ def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
440440 pixel (Tuple[int, int] | List[int] | np.ndarray): pixel coordinate in (v, u) indexing as used here.
441441 depth (np.ndarray): depth image of shape (H, W) in meters, where depth[v, u] returns the metric depth.
442442 intrinsic (np.ndarray): camera intrinsic matrix.
443- tf_camera_to_episodic (np.ndarray): homogeneous transform of shape (4, 4) mapping camera-frame points to
443+ tf_camera_to_episodic (np.ndarray): homogeneous transform of shape (4, 4) mapping camera-frame points to
444444 the episodic frame.
445445
446446 Returns:
0 commit comments