Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions config/pathfinder.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
[base]
env_name = pathfinder

[vec]
total_agents = 4096
num_buffers = 1
num_threads = 0

[env]
branch_prob = 0
loop_prob = 0
step_penalty = -0.001
new_wall_penalty = 0.0
known_wall_death_penalty = -1.0
repeat_move_death_penalty = -1.0
new_cell_reward = 0.01
revisit_penalty = -0.01
impossible_penalty = -1.0
goal_reward = 1.0
start_solution_len = 4
curriculum_enabled = 1
max_steps = 128

[policy]
hidden_size = 128
num_layers = 2
expansion_factor = 1

[train]
gpus = 1
seed = 42
total_timesteps = 10000000
learning_rate = 0.0050000000000000044
anneal_lr = 1
min_lr_ratio = 0
gamma = 0.9323750048012285
gae_lambda = 0.6447272834389924
replay_ratio = 3.9541522542643888
clip_coef = 0.04758361994585345
vf_coef = 2.5078946504421316
vf_clip_coef = 3.904257054821052
max_grad_norm = 0.31773469745800403
ent_coef = 0.007301801357163463
anneal_ent_coef = 0
min_ent_coef_ratio = 0.1
beta1 = 0.9
beta2 = 0.999
eps = 2.17069274192998e-12
minibatch_size = 8192
horizon = 32
vtrace_rho_clip = 2.1610638009706147
vtrace_c_clip = 4.02621810163764
prio_alpha = 0.3225588903414761
prio_beta0 = 1.0
anneal_prio_beta = 0
state_buffer_size = 0
cl_frac = 0
anneal_cl = 0
warmup_states = 100
state_checkpoint_interval = 16
explore_alpha = 1
explore_beta = 0
explore_decay = 0.99
use_rnn = 1
env = 0
eval_episodes = 2000

[sweep]
max_suggestion_cost = 7200
max_runs = 80
downsample = 5

[sweep.env.branch_prob]
distribution = uniform
min = 0.0
mean = 0.08
max = 0.22
scale = auto

[sweep.env.loop_prob]
distribution = uniform
min = 0.0
mean = 0.02
max = 0.12
scale = auto

[sweep.train.learning_rate]
distribution = log_normal
min = 0.0001
mean = 0.001
max = 0.005
scale = auto

[sweep.train.total_timesteps]
distribution = log_normal
min = 1e7
mean = 1.5e7
max = 2.5e7
scale = time

[sweep.policy.hidden_size]
distribution = uniform_pow2
min = 32
mean = 64
max = 128
scale = auto

[sweep.policy.num_layers]
distribution = uniform
min = 1
mean = 1
max = 2
scale = auto

[sweep.vec.total_agents]
distribution = uniform_pow2
min = 1024
mean = 1024
max = 8192
scale = auto

[sweep.train.horizon]
distribution = uniform_pow2
min = 16
mean = 32
max = 64
scale = auto

[sweep.train.minibatch_size]
distribution = uniform_pow2
min = 8192
mean = 16384
max = 16384
scale = auto
44 changes: 44 additions & 0 deletions ocean/pathfinder/binding.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include "pathfinder.h"
#define OBS_SIZE PATHFINDER_OBS_SIZE
#define NUM_ATNS 1
#define ACT_SIZES {4}
#define OBS_TENSOR_T FloatTensor

#define Env Pathfinder
#include "vecenv.h"

/* Field loaders used only by my_init: the kwargs key is the stringified field
 * name, so each setting is read from the entry named after the Env member it
 * is stored in. Both rely on `env` and `kwargs` being in scope. */
#define PATHFINDER_LOAD_FLOAT(field) \
    env->field = (float)dict_get(kwargs, #field)->value
#define PATHFINDER_LOAD_INT(field) \
    env->field = (int)dict_get(kwargs, #field)->value

/*
 * Configure a single environment from the keyword arguments.
 *
 * env    - environment to populate; num_agents is fixed to 1.
 * kwargs - dictionary holding one entry per setting read below.
 *
 * After every setting has been copied into env, init(env) is called.
 */
void my_init(Env* env, Dict* kwargs) {
    env->num_agents = 1;

    /* Settings stored as float. */
    PATHFINDER_LOAD_FLOAT(branch_prob);
    PATHFINDER_LOAD_FLOAT(loop_prob);
    PATHFINDER_LOAD_FLOAT(step_penalty);
    PATHFINDER_LOAD_FLOAT(new_wall_penalty);
    PATHFINDER_LOAD_FLOAT(known_wall_death_penalty);
    PATHFINDER_LOAD_FLOAT(repeat_move_death_penalty);
    PATHFINDER_LOAD_FLOAT(new_cell_reward);
    PATHFINDER_LOAD_FLOAT(revisit_penalty);
    PATHFINDER_LOAD_FLOAT(impossible_penalty);
    PATHFINDER_LOAD_FLOAT(goal_reward);

    /* Settings stored as int. */
    PATHFINDER_LOAD_INT(start_solution_len);
    PATHFINDER_LOAD_INT(curriculum_enabled);
    PATHFINDER_LOAD_INT(max_steps);

    init(env);
}

#undef PATHFINDER_LOAD_FLOAT
#undef PATHFINDER_LOAD_INT

/* Publishes one Log member into `out` under a key equal to the member name.
 * Used only by my_log; relies on `log` and `out` being in scope. */
#define PATHFINDER_EXPORT(metric) dict_set(out, #metric, log->metric)

/*
 * Copy every logged metric from `log` into the output dictionary `out`.
 *
 * Each entry is stored under the same name as the Log member it comes from,
 * and entries are written in the order listed here.
 */
void my_log(Log* log, Dict* out) {
    PATHFINDER_EXPORT(perf);
    PATHFINDER_EXPORT(score);
    PATHFINDER_EXPORT(episode_return);
    PATHFINDER_EXPORT(episode_length);
    PATHFINDER_EXPORT(success);
    PATHFINDER_EXPORT(wins);
    PATHFINDER_EXPORT(wall_hits);
    PATHFINDER_EXPORT(revisits);
    PATHFINDER_EXPORT(known_wall_deaths);
    PATHFINDER_EXPORT(repeat_move_deaths);
    PATHFINDER_EXPORT(shortest_path_len);
    PATHFINDER_EXPORT(agent_path_len);
    PATHFINDER_EXPORT(curriculum_level);
    PATHFINDER_EXPORT(curriculum_target_len);
    PATHFINDER_EXPORT(curriculum_next_target_len);
}

#undef PATHFINDER_EXPORT
65 changes: 65 additions & 0 deletions ocean/pathfinder/pathfinder.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#include <time.h>
#include "pathfinder.h"

/*
 * Translate this frame's key presses into a movement action.
 *
 * Each direction is bound to an arrow key and a WASD key. Bindings are
 * checked in the order north, east, south, west, and the first one with a
 * pressed key wins.
 *
 * Returns the matching PATHFINDER_ACT_* value, or -1 when no movement key
 * was pressed.
 */
static int read_manual_action(void) {
    const struct {
        int arrow_key;
        int letter_key;
        int action;
    } bindings[] = {
        {KEY_UP,    KEY_W, PATHFINDER_ACT_NORTH},
        {KEY_RIGHT, KEY_D, PATHFINDER_ACT_EAST},
        {KEY_DOWN,  KEY_S, PATHFINDER_ACT_SOUTH},
        {KEY_LEFT,  KEY_A, PATHFINDER_ACT_WEST},
    };
    const int binding_count = (int)(sizeof bindings / sizeof bindings[0]);

    for (int i = 0; i < binding_count; i++) {
        /* Arrow key is queried first; the letter key only if it was not pressed. */
        if (IsKeyPressed(bindings[i].arrow_key) || IsKeyPressed(bindings[i].letter_key)) {
            return bindings[i].action;
        }
    }
    return -1;
}

/* Fills in the scalar settings used by the interactive demo: player mode,
 * a time-based rng seed, and the generation, curriculum and reward values. */
static void configure_demo_env(Pathfinder* env) {
    env->num_agents = 1;
    env->player_mode = true;
    env->rng = (unsigned int)time(NULL);

    env->branch_prob = 0.35f;
    env->loop_prob = 0.10f;
    env->start_solution_len = 4;
    env->curriculum_enabled = 1;
    env->max_steps = 128;

    env->step_penalty = -0.001f;
    env->new_wall_penalty = 0.0f;
    env->known_wall_death_penalty = -1.0f;
    env->repeat_move_death_penalty = -1.0f;
    env->new_cell_reward = 0.01f;
    env->revisit_penalty = -0.01f;
    env->impossible_penalty = -1.0f;
    env->goal_reward = 1.0f;
}

/*
 * Interactive entry point: runs one environment driven by the keyboard.
 *
 * Controls handled in the loop:
 *   R                 - reset the environment
 *   arrow keys / WASD - step with the chosen movement action
 *   SPACE             - step with an action drawn from pathfinder_rand
 *
 * The environment is rendered once before the loop and once per iteration,
 * until WindowShouldClose() reports true. Returns 0.
 */
int main(void) {
    /* Stack-backed buffers for the single agent. */
    float observations[PATHFINDER_OBS_SIZE] = {0};
    float actions[1] = {0};
    float rewards[1] = {0};
    float terminals[1] = {0};

    Pathfinder env;
    memset(&env, 0, sizeof(env));
    env.observations = observations;
    env.actions = actions;
    env.rewards = rewards;
    env.terminals = terminals;
    configure_demo_env(&env);

    init(&env);
    c_reset(&env);
    c_render(&env);

    while (!WindowShouldClose()) {
        if (IsKeyPressed(KEY_R)) {
            c_reset(&env);
        }

        bool step_requested = false;
        const int manual_action = read_manual_action();
        if (manual_action >= 0) {
            actions[0] = (float)manual_action;
            step_requested = true;
        } else if (IsKeyPressed(KEY_SPACE)) {
            /* SPACE is only consulted when no movement key was pressed. */
            actions[0] = (float)(pathfinder_rand(&env) % PATHFINDER_NUM_ACTIONS);
            step_requested = true;
        }

        if (step_requested) {
            c_step(&env);
        }

        c_render(&env);
    }

    c_close(&env);
    return 0;
}
Loading
Loading