-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathopen_loop.yaml
More file actions
126 lines (116 loc) · 2.73 KB
/
open_loop.yaml
File metadata and controls
126 lines (116 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Hydra configuration for open-loop evaluation of the DrivePiZero agent
# (src.agent.drivepi0.eval.DrivePiZeroEvalAgent) on Bench2Drive ("b2d") data.
#
# Normalizations applied:
#   - booleans written canonically lowercase (true/false) per yamllint `truthy`
#   - exponent floats written with an explicit mantissa dot (1.0e-6) so YAML 1.1
#     resolvers (PyYAML, used by OmegaConf/Hydra) load them as floats, not strings
defaults:
  - _self_
hydra:
  run:
    dir: ${log_dir}  # Hydra writes run outputs into the log directory below
_target_: src.agent.drivepi0.eval.DrivePiZeroEvalAgent
log_dir: log/eval/${name}/${now:%Y-%m-%d}_${now:%H-%M}_${seed}
name: drive-pi0
device: cuda
gpu_id: 0
n_nodes: 1
seed: 42
# Empty value -> null: adaptive mode for the action expert is disabled here.
# Referenced below via ${action_expert_adaptive_mode}.
action_expert_adaptive_mode:
# Global quantization / LoRA switches, interpolated into the experts below.
quantize: false
lora: false
lora_r: 32
lora_dropout: 0.0
pretrained_model_path: ckpts/paligemma-3b-pt-224
checkpoint_path: "YOUR_CHECKPOINT_PATH" # You need to set path
# Evaluation dataset settings.
data:
  statistics_path: config/statistics/b2d_statistics.json
  work_dir: exp/b2d_action
  split: val
  return_camera_id: false
  is_drivemoe: false
  set_scene_priority: false
# Thresholds used by the evaluator — presumably action-error tolerances;
# NOTE(review): confirm units/metric against the eval code.
eval_thresholds: [0.05, 0.1, 0.2, 0.3, 0.5]
device_batch_size: 32
num_workers: 32
# Flow-matching inference settings.
flow_sampling: beta
num_inference_steps: 10
final_action_clip_value: 1.0
cond_steps: 5      # conditioning (observation history) steps
horizon_steps: 20  # predicted action horizon length
action_dim: 2
proprio_dim: 10
max_seq_len: 532
tokenizer_padding: max_length
max_image_text_tokens: ${max_seq_len}
# Per-expert transformer settings consumed by the joint model below.
mixture:
  vlm:
    hidden_size: 2048
    intermediate_size: 16384
    use_final_norm: false
    cache: true
    use_quantize: ${quantize}
    use_lora: ${lora}
    adaptive_mode:  # empty -> null: no adaptive mode for the VLM expert
  proprio:
    hidden_size: 1024
    intermediate_size: 4096
    use_final_norm: true
    cache: true
    use_quantize: false
    use_lora: false
    adaptive_mode: ${action_expert_adaptive_mode}
  action:
    hidden_size: 1024
    intermediate_size: 4096
    use_final_norm: true
    cache: false  # action expert is re-run each step, so no KV cache
    use_quantize: false
    use_lora: false
    adaptive_mode: ${action_expert_adaptive_mode}
time_hidden_size: 256
# PaliGemma tokenizer constants.
image_token_index: 257152
vocab_size: 257216
pad_token_id: 0
# SigLIP vision tower (PaliGemma 224px configuration).
vision:
  _target_: src.model.paligemma.siglip.SiglipVisionModel
  config:
    hidden_size: 1152
    intermediate_size: 4304
    num_hidden_layers: 27
    num_attention_heads: 16
    num_channels: 3
    image_size: 224
    patch_size: 14
    layer_norm_eps: 1.0e-6
    attention_dropout: 0.0
    num_image_tokens: 512
    lora:
      r: ${lora_r}
      dropout: ${lora_dropout}
  use_quantize: ${quantize}
  use_lora: ${lora}
# Projects vision features into the language-model embedding space.
vision_projector:
  _target_: src.model.paligemma.siglip.PaliGemmaMultiModalProjector
  config:
    vision_config:
      hidden_size: 1152
    projection_dim: 2048
    lora:
      r: ${lora_r}
      dropout: ${lora_dropout}
  use_quantize: ${quantize}
  use_lora: ${lora}
# Joint transformer combining the vlm / proprio / action experts.
joint:
  _target_: src.model.DrivePi0.joint_model.JointModel
  config:
    action_expert_adaptive_mode: ${action_expert_adaptive_mode}
    time_hidden_size: ${time_hidden_size}
    mixture: ${mixture}
    lora:
      r: ${lora_r}
      dropout: ${lora_dropout}
    num_hidden_layers: 18
    num_attention_heads: 8
    num_key_value_heads: 1  # multi-query attention: single shared KV head
    head_dim: 256
    max_position_embeddings: 8192
    rms_norm_eps: 1.0e-6
    rope_theta: 10000.0
    attention_bias: false
    attention_dropout: 0.0
    pad_token_id: ${pad_token_id}