Skip to content

Commit 1257cea

Browse files
committed
messy self play tourney
1 parent 98be28a commit 1257cea

7 files changed

Lines changed: 310 additions & 109 deletions

File tree

config/default.ini

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ min_games = 2048
3737
elo_init = 0.0
3838
elo_k = 16.0
3939
seed = 42
40+
# Optional league-state JSON to use as a frozen external opponent pool instead
41+
# of the current run's snapshot pool. Empty keeps the default snapshot behavior.
42+
external_opponent_state_path =
4043
# Add a snapshot to the pool every snapshot_interval global steps, independent
4144
# of swap. 0 disables interval snapshotting (pool stays at bootstrap).
4245
snapshot_interval = 1_000_000_000
@@ -122,17 +125,16 @@ bot_eval_burnin_episodes = 0
122125
bot_eval_policy = -1
123126
bot_eval_max_ticks = 0
124127

125-
# League sweep mode for Robocode: train on all but one GPU while a long-lived
126-
# match worker rates completed final checkpoints. league_train_gpus = 0 means
127-
# auto: use sweep.gpus - league_match_gpus. Ignored unless league = True.
128+
# League sweep mode for Robocode: train independent historical-selfplay trials
129+
# on all but one GPU while a long-lived match worker rates completed final
130+
# checkpoints. league_train_gpus = 0 means auto: use sweep.gpus -
131+
# league_match_gpus. Ignored unless league = True.
128132
league = False
129133
league_train_gpus = 0
130134
league_match_gpus = 1
131135
league_match_games = 4096
132136
league_match_eval_agents = 8192
133137
league_anchor_prob = 0.12
134-
league_opponent_frac = 0.20
135-
league_opponent_swap_steps = 100_000_000
136138
league_state_path = ''
137139

138140
[sweep.train.total_timesteps]

config/robocode.ini

Lines changed: 59 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@ width = 800
2727
height = 600
2828
reward_damage = 0.0
2929
reward_spot = 0.0
30-
dr = 0.181647854498119
30+
reward_melee_damage_inflicted = 0.0
31+
reward_damage_taken = 0.0
32+
reward_range_damage_inflicted = 0.0
33+
dr = 0.26211986603159293
3134
bot_policy = 1
3235
max_ticks = 3000
3336

@@ -38,54 +41,79 @@ num_layers = 3
3841
[train]
3942
gpus = 1
4043
seed = 42
44+
#total_timesteps = 4_866_912_352
4145
total_timesteps = 2_000_000_000
42-
learning_rate = 0.01
46+
learning_rate = 0.0009396226782705388
4347
anneal_lr = 1
4448
min_lr_ratio = 0
45-
gamma = 0.9998480946489037
46-
gae_lambda = 0.9786838910227905
47-
horizon = 256
49+
gamma = 0.9999
50+
gae_lambda = 0.9806795173706343
51+
replay_ratio = 0.25
52+
clip_coef = 0.376885220359416
53+
vf_coef = 1.7165574392111924
54+
vf_clip_coef = 0.1043072795031198
55+
max_grad_norm = 0.5950188529406295
56+
ent_coef = 1.1745624395236018e-05
57+
anneal_ent_coef = 0
58+
min_ent_coef_ratio = 0.1
59+
beta1 = 0.807491153888714
60+
beta2 = 0.9995753999950486
61+
eps = 1e-14
62+
minibatch_size = 8192
63+
horizon = 64
64+
vtrace_rho_clip = 1.1533403498692292
65+
vtrace_c_clip = 1.5182455166896063
66+
prio_alpha = 0.6808270261043561
67+
prio_beta0 = 0.4740573494990283
4868

4969
[sweep]
5070
league = True
51-
match_enemy_model_path = 'resources/robocode/best_robo.bin'
52-
match_num_games = 4096
53-
match_max_ticks = 4096
54-
match_enemy_hidden_size = 1024
55-
match_enemy_num_layers = 2.69591
56-
bot_eval = True
57-
bot_eval_episodes = 32768
58-
bot_eval_envs = 4096
59-
bot_eval_burnin_episodes = 4096
60-
bot_eval_policy = -1
61-
bot_eval_max_ticks = 0
6271
downsample = 1
63-
metric = bot_perf
72+
metric = elo
6473

6574
[sweep.train.total_timesteps]
6675
distribution = log_normal
67-
min = 1e8
68-
max = 1e9
76+
min = 5e8
77+
max = 1e11
6978
mean = 5e8
7079
scale = auto
7180

72-
#[sweep.env.reward_damage]
73-
#distribution = uniform
74-
#min = 0.0
75-
#max = 0.1
76-
#mean = 0.01
77-
#scale = auto
81+
[sweep.policy.hidden_size]
82+
distribution = uniform_pow2
83+
min = 32
84+
max = 1024
85+
scale = auto
7886

79-
#[sweep.env.reward_spot]
80-
#distribution = uniform
81-
#min = 0.0
82-
#max = 0.01
83-
#mean = 0.001
84-
#scale = auto
87+
[sweep.policy.num_layers]
88+
distribution = uniform
89+
min = 1
90+
max = 8
91+
scale = auto
8592

8693
[sweep.env.dr]
8794
distribution = uniform
8895
min = 0.0
8996
max = 0.6
9097
mean = 0.3
9198
scale = auto
99+
100+
[sweep.env.reward_melee_damage_inflicted]
101+
distribution = uniform
102+
min = 0.0
103+
max = 0.02
104+
mean = 0.005
105+
scale = auto
106+
107+
[sweep.env.reward_range_damage_inflicted]
108+
distribution = uniform
109+
min = 0.0
110+
max = 0.02
111+
mean = 0.005
112+
scale = auto
113+
114+
[sweep.env.reward_damage_taken]
115+
distribution = uniform
116+
min = -0.02
117+
max = 0.0
118+
mean = -0.005
119+
scale = auto

ocean/robocode/binding.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,35 @@ void my_setup_perm(StaticVec* vec, Env* env, int slot_base) {
2323
}
2424
}
2525

26+
27+
// Read an optional kwarg as a float, falling back to a caller-supplied default.
//   kwargs        - dictionary handed to dict_get_unsafe() for the lookup
//   key           - name of the entry to look up
//   default_value - returned when the lookup yields no item
// Returns the item's value cast to float when dict_get_unsafe() gives back a
// non-NULL item, otherwise default_value.
// NOTE(review): presumably dict_get_unsafe() returns NULL for a missing key
// instead of aborting -- confirm against its definition.
static inline float dict_get_float_default(Dict* kwargs, const char* key, float default_value) {
28+
DictItem* item = dict_get_unsafe(kwargs, key);
29+
return item ? (float)item->value : default_value;
30+
}
31+
2632
void my_init(Env* env, Dict* kwargs) {
2733
env->width = dict_get(kwargs, "width")->value;
2834
env->height = dict_get(kwargs, "height")->value;
2935
env->num_agents = dict_get(kwargs, "num_agents")->value;
3036
env->num_bots = dict_get(kwargs, "num_bots")->value;
3137
env->max_ticks = (int)dict_get(kwargs, "max_ticks")->value;
32-
env->reward_damage = dict_get(kwargs, "reward_damage")->value;
33-
env->reward_spot = dict_get(kwargs, "reward_spot")->value;
38+
env->reward_damage = dict_get_float_default(kwargs, "reward_damage", 0.0f);
39+
env->reward_spot = dict_get_float_default(kwargs, "reward_spot", 0.0f);
40+
env->reward_melee_damage_inflicted = dict_get_float_default(kwargs, "reward_melee_damage_inflicted", 0.0f);
41+
env->reward_damage_taken = dict_get_float_default(kwargs, "reward_damage_taken", 0.0f);
42+
env->reward_range_damage_inflicted = dict_get_float_default(kwargs, "reward_range_damage_inflicted", 0.0f);
43+
env->reward_melee_damage_inflicted_slot_0 = dict_get_float_default(kwargs,
44+
"reward_melee_damage_inflicted_slot_0", env->reward_melee_damage_inflicted);
45+
env->reward_damage_taken_slot_0 = dict_get_float_default(kwargs,
46+
"reward_damage_taken_slot_0", env->reward_damage_taken);
47+
env->reward_range_damage_inflicted_slot_0 = dict_get_float_default(kwargs,
48+
"reward_range_damage_inflicted_slot_0", env->reward_range_damage_inflicted);
49+
env->reward_melee_damage_inflicted_slot_1 = dict_get_float_default(kwargs,
50+
"reward_melee_damage_inflicted_slot_1", env->reward_melee_damage_inflicted);
51+
env->reward_damage_taken_slot_1 = dict_get_float_default(kwargs,
52+
"reward_damage_taken_slot_1", env->reward_damage_taken);
53+
env->reward_range_damage_inflicted_slot_1 = dict_get_float_default(kwargs,
54+
"reward_range_damage_inflicted_slot_1", env->reward_range_damage_inflicted);
3455
DictItem* dr_item = dict_get_unsafe(kwargs, "dr");
3556
env->dr = dr_item ? (float)dr_item->value : 0.0f;
3657
env->bot_policy = dict_get(kwargs, "bot_policy")->value;
@@ -41,6 +62,9 @@ void my_log(Log* log, Dict* out) {
4162
dict_set(out, "perf", log->perf);
4263
dict_set(out, "score", log->score);
4364
dict_set(out, "damage_received", log->damage_received);
65+
dict_set(out, "melee_damage_inflicted", log->melee_damage_inflicted);
66+
dict_set(out, "damage_taken", log->damage_taken);
67+
dict_set(out, "range_damage_inflicted", log->range_damage_inflicted);
4468
dict_set(out, "episode_return", log->episode_return);
4569
dict_set(out, "episode_length", log->episode_length);
4670
// Historical-pool stats. selfplay.py reads hist_score_bank_<b> /

0 commit comments

Comments
 (0)