PufferAI · Kinvert · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/config/pathfinder.ini b/config/pathfinder.ini
@@ -0,0 +1,134 @@
+[base]
+env_name = pathfinder
+
+[vec]
+total_agents = 4096
+num_buffers = 1
+num_threads = 0
+
+[env]
+branch_prob = 0
+loop_prob = 0
+step_penalty = -0.001
+new_wall_penalty = 0.0
+known_wall_death_penalty = -1.0
+repeat_move_death_penalty = -1.0
+new_cell_reward = 0.01
+revisit_penalty = -0.01
+impossible_penalty = -1.0
+goal_reward = 1.0
+start_solution_len = 4
+curriculum_enabled = 1
+max_steps = 128
+
+[policy]
+hidden_size = 128
+num_layers = 2
+expansion_factor = 1
+
+[train]
+gpus = 1
+seed = 42
+total_timesteps = 10000000
+learning_rate = 0.0050000000000000044
+anneal_lr = 1
+min_lr_ratio = 0
+gamma = 0.9323750048012285
+gae_lambda = 0.6447272834389924
+replay_ratio = 3.9541522542643888
+clip_coef = 0.04758361994585345
+vf_coef = 2.5078946504421316
+vf_clip_coef = 3.904257054821052
+max_grad_norm = 0.31773469745800403
+ent_coef = 0.007301801357163463
+anneal_ent_coef = 0
+min_ent_coef_ratio = 0.1
+beta1 = 0.9
+beta2 = 0.999
+eps = 2.17069274192998e-12
+minibatch_size = 8192
+horizon = 32
+vtrace_rho_clip = 2.1610638009706147
+vtrace_c_clip = 4.02621810163764
+prio_alpha = 0.3225588903414761
+prio_beta0 = 1.0
+anneal_prio_beta = 0
+state_buffer_size = 0
+cl_frac = 0
+anneal_cl = 0
+warmup_states = 100
+state_checkpoint_interval = 16
+explore_alpha = 1
+explore_beta = 0
+explore_decay = 0.99
+use_rnn = 1
+env = 0
+eval_episodes = 2000
+
+[sweep]
+max_suggestion_cost = 7200
+max_runs = 80
+downsample = 5
+
+[sweep.env.branch_prob]
+distribution = uniform
+min = 0.0
+mean = 0.08
+max = 0.22
+scale = auto
+
+[sweep.env.loop_prob]
+distribution = uniform
+min = 0.0
+mean = 0.02
+max = 0.12
+scale = auto
+
+[sweep.train.learning_rate]
+distribution = log_normal
+min = 0.0001
+mean = 0.001
+max = 0.005
+scale = auto
+
+[sweep.train.total_timesteps]
+distribution = log_normal
+min = 1e7
+mean = 1.5e7
+max = 2.5e7
+scale = time
+
+[sweep.policy.hidden_size]
+distribution = uniform_pow2
+min = 32
+mean = 64
+max = 128
+scale = auto
+
+[sweep.policy.num_layers]
+distribution = uniform
+min = 1
+mean = 1
+max = 2
+scale = auto
+
+[sweep.vec.total_agents]
+distribution = uniform_pow2
+min = 1024
+mean = 1024
+max = 8192
+scale = auto
+
+[sweep.train.horizon]
+distribution = uniform_pow2
+min = 16
+mean = 32
+max = 64
+scale = auto
+
+[sweep.train.minibatch_size]
+distribution = uniform_pow2
+min = 8192
+mean = 16384
+max = 16384
+scale = auto
diff --git a/ocean/pathfinder/binding.c b/ocean/pathfinder/binding.c
@@ -0,0 +1,44 @@
+#include "pathfinder.h"
+#define OBS_SIZE PATHFINDER_OBS_SIZE
+#define NUM_ATNS 1
+#define ACT_SIZES {4}
+#define OBS_TENSOR_T FloatTensor
+
+#define Env Pathfinder
+#include "vecenv.h"
+
+void my_init(Env* env, Dict* kwargs) {
+    env->num_agents = 1;
+    env->branch_prob = (float)dict_get(kwargs, "branch_prob")->value;
+    env->loop_prob = (float)dict_get(kwargs, "loop_prob")->value;
+    env->step_penalty = (float)dict_get(kwargs, "step_penalty")->value;
+    env->new_wall_penalty = (float)dict_get(kwargs, "new_wall_penalty")->value;
+    env->known_wall_death_penalty = (float)dict_get(kwargs, "known_wall_death_penalty")->value;
+    env->repeat_move_death_penalty = (float)dict_get(kwargs, "repeat_move_death_penalty")->value;
+    env->new_cell_reward = (float)dict_get(kwargs, "new_cell_reward")->value;
+    env->revisit_penalty = (float)dict_get(kwargs, "revisit_penalty")->value;
+    env->impossible_penalty = (float)dict_get(kwargs, "impossible_penalty")->value;
+    env->goal_reward = (float)dict_get(kwargs, "goal_reward")->value;
+    env->start_solution_len = (int)dict_get(kwargs, "start_solution_len")->value;
+    env->curriculum_enabled = (int)dict_get(kwargs, "curriculum_enabled")->value;
+    env->max_steps = (int)dict_get(kwargs, "max_steps")->value;
+    init(env);
+}
+
+void my_log(Log* log, Dict* out) {
+    dict_set(out, "perf", log->perf);
+    dict_set(out, "score", log->score);
+    dict_set(out, "episode_return", log->episode_return);
+    dict_set(out, "episode_length", log->episode_length);
+    dict_set(out, "success", log->success);
+    dict_set(out, "wins", log->wins);
+    dict_set(out, "wall_hits", log->wall_hits);
+    dict_set(out, "revisits", log->revisits);
+    dict_set(out, "known_wall_deaths", log->known_wall_deaths);
+    dict_set(out, "repeat_move_deaths", log->repeat_move_deaths);
+    dict_set(out, "shortest_path_len", log->shortest_path_len);
+    dict_set(out, "agent_path_len", log->agent_path_len);
+    dict_set(out, "curriculum_level", log->curriculum_level);
+    dict_set(out, "curriculum_target_len", log->curriculum_target_len);
+    dict_set(out, "curriculum_next_target_len", log->curriculum_next_target_len);
+}
diff --git a/ocean/pathfinder/pathfinder.c b/ocean/pathfinder/pathfinder.c
@@ -0,0 +1,65 @@
+#include <time.h>
+#include "pathfinder.h"
+
+static int read_manual_action(void) {
+    /* Arrow keys and WASD map to moves; checked N, E, S, W and the first hit wins; -1 if none. */
+    return (IsKeyPressed(KEY_UP) || IsKeyPressed(KEY_W)) ? PATHFINDER_ACT_NORTH
+         : (IsKeyPressed(KEY_RIGHT) || IsKeyPressed(KEY_D)) ? PATHFINDER_ACT_EAST
+         : (IsKeyPressed(KEY_DOWN) || IsKeyPressed(KEY_S)) ? PATHFINDER_ACT_SOUTH
+         : (IsKeyPressed(KEY_LEFT) || IsKeyPressed(KEY_A)) ? PATHFINDER_ACT_WEST : -1;
+}
+
+int main(void) {
+    /* Manual demo loop: arrows/WASD step, SPACE steps with a random action, R resets. */
+    float observations[PATHFINDER_OBS_SIZE] = {0};
+    float actions[1] = {0};
+    float rewards[1] = {0};
+    float terminals[1] = {0};
+    Pathfinder env = {  /* members not listed here are zero-initialized */
+        .observations = observations,
+        .actions = actions,
+        .rewards = rewards,
+        .terminals = terminals,
+        .num_agents = 1,
+        .player_mode = true,
+        .rng = (unsigned int)time(NULL),  /* taken from the wall clock */
+        .branch_prob = 0.35f,
+        .loop_prob = 0.10f,
+        .start_solution_len = 4,
+        .curriculum_enabled = 1,
+        .max_steps = 128,
+        .step_penalty = -0.001f,
+        .new_wall_penalty = 0.0f,
+        .known_wall_death_penalty = -1.0f,
+        .repeat_move_death_penalty = -1.0f,
+        .new_cell_reward = 0.01f,
+        .revisit_penalty = -0.01f,
+        .impossible_penalty = -1.0f,
+        .goal_reward = 1.0f,
+    };
+
+    init(&env);
+    c_reset(&env);
+    c_render(&env);
+
+    while (!WindowShouldClose()) {
+        if (IsKeyPressed(KEY_R)) {
+            c_reset(&env);
+        }
+        int picked = read_manual_action();
+        int step_now = 1;
+        if (picked >= 0) {
+            actions[0] = (float)picked;
+        } else if (IsKeyPressed(KEY_SPACE)) {
+            actions[0] = (float)(pathfinder_rand(&env) % PATHFINDER_NUM_ACTIONS);
+        } else {
+            step_now = 0;  /* no input this frame: render only */
+        }
+        if (step_now) {
+            c_step(&env);
+        }
+        c_render(&env);
+    }
+    c_close(&env);
+    return 0;
+}