diff --git a/config/pathfinder.ini b/config/pathfinder.ini
new file mode 100644
index 0000000000..4853d0779c
--- /dev/null
+++ b/config/pathfinder.ini
@@ -0,0 +1,134 @@
+[base]
+env_name = pathfinder
+
+[vec]
+total_agents = 4096
+num_buffers = 1
+num_threads = 0
+
+[env]
+branch_prob = 0
+loop_prob = 0
+step_penalty = -0.001
+new_wall_penalty = 0.0
+known_wall_death_penalty = -1.0
+repeat_move_death_penalty = -1.0
+new_cell_reward = 0.01
+revisit_penalty = -0.01
+impossible_penalty = -1.0
+goal_reward = 1.0
+start_solution_len = 4
+curriculum_enabled = 1
+max_steps = 128
+
+[policy]
+hidden_size = 128
+num_layers = 2
+expansion_factor = 1
+
+[train]
+gpus = 1
+seed = 42
+total_timesteps = 10000000
+learning_rate = 0.0050000000000000044
+anneal_lr = 1
+min_lr_ratio = 0
+gamma = 0.9323750048012285
+gae_lambda = 0.6447272834389924
+replay_ratio = 3.9541522542643888
+clip_coef = 0.04758361994585345
+vf_coef = 2.5078946504421316
+vf_clip_coef = 3.904257054821052
+max_grad_norm = 0.31773469745800403
+ent_coef = 0.007301801357163463
+anneal_ent_coef = 0
+min_ent_coef_ratio = 0.1
+beta1 = 0.9
+beta2 = 0.999
+eps = 2.17069274192998e-12
+minibatch_size = 8192
+horizon = 32
+vtrace_rho_clip = 2.1610638009706147
+vtrace_c_clip = 4.02621810163764
+prio_alpha = 0.3225588903414761
+prio_beta0 = 1.0
+anneal_prio_beta = 0
+state_buffer_size = 0
+cl_frac = 0
+anneal_cl = 0
+warmup_states = 100
+state_checkpoint_interval = 16
+explore_alpha = 1
+explore_beta = 0
+explore_decay = 0.99
+use_rnn = 1
+env = 0
+eval_episodes = 2000
+
+[sweep]
+max_suggestion_cost = 7200
+max_runs = 80
+downsample = 5
+
+[sweep.env.branch_prob]
+distribution = uniform
+min = 0.0
+mean = 0.08
+max = 0.22
+scale = auto
+
+[sweep.env.loop_prob]
+distribution = uniform
+min = 0.0
+mean = 0.02
+max = 0.12
+scale = auto
+
+[sweep.train.learning_rate]
+distribution = log_normal
+min = 0.0001
+mean = 0.001
+max = 0.005
+scale = auto
+
+[sweep.train.total_timesteps]
+distribution = log_normal
+min = 1e7
+mean = 1.5e7
+max = 2.5e7
+scale = time
+
+[sweep.policy.hidden_size]
+distribution = uniform_pow2
+min = 32
+mean = 64
+max = 128
+scale = auto
+
+[sweep.policy.num_layers]
+distribution = uniform
+min = 1
+mean = 1
+max = 2
+scale = auto
+
+[sweep.vec.total_agents]
+distribution = uniform_pow2
+min = 1024
+mean = 1024
+max = 8192
+scale = auto
+
+[sweep.train.horizon]
+distribution = uniform_pow2
+min = 16
+mean = 32
+max = 64
+scale = auto
+
+[sweep.train.minibatch_size]
+distribution = uniform_pow2
+min = 8192
+mean = 16384
+max = 16384
+scale = auto
diff --git a/ocean/pathfinder/binding.c b/ocean/pathfinder/binding.c
new file mode 100644
index 0000000000..fa01df3796
--- /dev/null
+++ b/ocean/pathfinder/binding.c
@@ -0,0 +1,44 @@
+#include "pathfinder.h"
+#define OBS_SIZE PATHFINDER_OBS_SIZE // one value per wall edge plus the agent's column and row
+#define NUM_ATNS 1 // a single discrete action per step
+#define ACT_SIZES {4} // four moves: north, east, south, west
+#define OBS_TENSOR_T FloatTensor
+
+#define Env Pathfinder // presumably the environment type vecenv.h is written against - confirm in vecenv.h
+#include "vecenv.h"
+
+void my_init(Env* env, Dict* kwargs) { // Copy the settings in kwargs (same keys as [env] in config/pathfinder.ini) into env, then run init().
+    env->num_agents = 1; // c_step only ever reads actions[0] and writes rewards[0]/terminals[0]
+    env->branch_prob = (float)dict_get(kwargs, "branch_prob")->value; // NOTE(review): every dict_get() result here is dereferenced unchecked - presumably all keys are guaranteed; confirm in vecenv.h
+    env->loop_prob = (float)dict_get(kwargs, "loop_prob")->value; // added to branch_prob by open_random_edges()
+    env->step_penalty = (float)dict_get(kwargs, "step_penalty")->value; // base reward of every step
+    env->new_wall_penalty = (float)dict_get(kwargs, "new_wall_penalty")->value; // bumping a wall that was still unknown
+    env->known_wall_death_penalty = (float)dict_get(kwargs, "known_wall_death_penalty")->value; // bumping a wall already revealed (ends the episode)
+    env->repeat_move_death_penalty = (float)dict_get(kwargs, "repeat_move_death_penalty")->value; // stepping back and forth between two cells (ends the episode)
+    env->new_cell_reward = (float)dict_get(kwargs, "new_cell_reward")->value; // entering a cell for the first time
+    env->revisit_penalty = (float)dict_get(kwargs, "revisit_penalty")->value; // entering a cell visited before
+    env->impossible_penalty = (float)dict_get(kwargs, "impossible_penalty")->value; // moving off the board or sending an invalid action (ends the episode)
+    env->goal_reward = (float)dict_get(kwargs, "goal_reward")->value; // reaching the goal cell
+    env->start_solution_len = (int)dict_get(kwargs, "start_solution_len")->value; // solution length at curriculum level 0
+    env->curriculum_enabled = (int)dict_get(kwargs, "curriculum_enabled")->value; // 0 = always generate maximum-length solutions
+    env->max_steps = (int)dict_get(kwargs, "max_steps")->value; // episode is cut off once tick reaches this value
+    init(env);
+}
+
+void my_log(Log* log, Dict* out) { // Export the accumulated metrics (every Log field except n) under their own names.
+    dict_set(out, "perf", log->perf);
+    dict_set(out, "score", log->score);
+    dict_set(out, "episode_return", log->episode_return);
+    dict_set(out, "episode_length", log->episode_length);
+    dict_set(out, "success", log->success);
+    dict_set(out, "wins", log->wins);
+    dict_set(out, "wall_hits", log->wall_hits);
+    dict_set(out, "revisits", log->revisits);
+    dict_set(out, "known_wall_deaths", log->known_wall_deaths);
+    dict_set(out, "repeat_move_deaths", log->repeat_move_deaths);
+    dict_set(out, "shortest_path_len", log->shortest_path_len);
+    dict_set(out, "agent_path_len", log->agent_path_len);
+    dict_set(out, "curriculum_level", log->curriculum_level);
+    dict_set(out, "curriculum_target_len", log->curriculum_target_len);
+    dict_set(out, "curriculum_next_target_len", log->curriculum_next_target_len);
+}
diff --git a/ocean/pathfinder/pathfinder.c b/ocean/pathfinder/pathfinder.c
new file mode 100644
index 0000000000..b42fd33279
--- /dev/null
+++ b/ocean/pathfinder/pathfinder.c
@@ -0,0 +1,67 @@
+#include <time.h>
+#include "pathfinder.h"
+
+// Map a freshly pressed arrow/WASD key to an action id, or -1 when no movement key was pressed.
+static int read_manual_action(void) {
+    if (IsKeyPressed(KEY_UP) || IsKeyPressed(KEY_W)) return PATHFINDER_ACT_NORTH;
+    if (IsKeyPressed(KEY_RIGHT) || IsKeyPressed(KEY_D)) return PATHFINDER_ACT_EAST;
+    if (IsKeyPressed(KEY_DOWN) || IsKeyPressed(KEY_S)) return PATHFINDER_ACT_SOUTH;
+    if (IsKeyPressed(KEY_LEFT) || IsKeyPressed(KEY_A)) return PATHFINDER_ACT_WEST;
+    return -1;
+}
+
+// Interactive demo: move with the keyboard, SPACE takes a random action, R generates a new maze.
+int main(void) {
+    Pathfinder env;
+    memset(&env, 0, sizeof(env));
+
+    float observations[PATHFINDER_OBS_SIZE] = {0};
+    float actions[1] = {0};
+    float rewards[1] = {0};
+    float terminals[1] = {0};
+
+    env.observations = observations;
+    env.actions = actions;
+    env.rewards = rewards;
+    env.terminals = terminals;
+    env.num_agents = 1;
+    env.player_mode = true; // c_render keeps the truth view off in player mode
+    env.rng = (unsigned int)time(NULL);
+    env.branch_prob = 0.35f;
+    env.loop_prob = 0.10f;
+    env.start_solution_len = 4;
+    env.curriculum_enabled = 1;
+    env.max_steps = 128;
+    env.step_penalty = -0.001f;
+    env.new_wall_penalty = 0.0f;
+    env.known_wall_death_penalty = -1.0f;
+    env.repeat_move_death_penalty = -1.0f;
+    env.new_cell_reward = 0.01f;
+    env.revisit_penalty = -0.01f;
+    env.impossible_penalty = -1.0f;
+    env.goal_reward = 1.0f;
+
+    init(&env);
+    c_reset(&env);
+
+    c_render(&env); // first call opens the window, so WindowShouldClose() below is meaningful
+    while (!WindowShouldClose()) {
+        if (IsKeyPressed(KEY_R)) {
+            c_reset(&env);
+        }
+
+        int action = read_manual_action();
+        if (action >= 0) {
+            actions[0] = (float)action;
+            c_step(&env);
+        } else if (IsKeyPressed(KEY_SPACE)) {
+            actions[0] = (float)(pathfinder_rand(&env) % PATHFINDER_NUM_ACTIONS);
+            c_step(&env);
+        }
+
+        c_render(&env);
+    }
+
+    c_close(&env);
+    return 0;
+}
diff --git a/ocean/pathfinder/pathfinder.h b/ocean/pathfinder/pathfinder.h
new file mode 100644
index 0000000000..65710873db
--- /dev/null
+++ b/ocean/pathfinder/pathfinder.h
@@ -0,0 +1,926 @@
+#pragma once
+
+#include <stdbool.h> // NOTE(review): the five include targets were lost in extraction; restored from the names this header uses - confirm against the original
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if !defined(PATHFINDER_NO_RENDER) && !defined(PUFFER_PERF_NO_RENDER)
+#include "raylib.h"
+#endif
+
+#define PATHFINDER_ROWS 6
+#define PATHFINDER_COLS 6
+#define PATHFINDER_VERTICAL_WALLS (PATHFINDER_ROWS * (PATHFINDER_COLS + 1))
+#define PATHFINDER_HORIZONTAL_WALLS ((PATHFINDER_ROWS + 1) * PATHFINDER_COLS)
+#define PATHFINDER_NUM_WALLS (PATHFINDER_VERTICAL_WALLS + PATHFINDER_HORIZONTAL_WALLS)
+#define PATHFINDER_OBS_SIZE (PATHFINDER_NUM_WALLS + 2) // wall knowledge plus agent column and row
+#define PATHFINDER_NUM_ACTIONS 4
+#define PATHFINDER_MAX_SOLUTION_LEN ((PATHFINDER_ROWS - 1) + (PATHFINDER_COLS - 1))
+#define PATHFINDER_MAX_PATH_CELLS (PATHFINDER_ROWS * PATHFINDER_COLS)
+
+#define PATHFINDER_RENDER_MAX_SIDE ((PATHFINDER_ROWS > PATHFINDER_COLS) ? PATHFINDER_ROWS : PATHFINDER_COLS)
+#define PATHFINDER_RENDER_TILE (432 / PATHFINDER_RENDER_MAX_SIDE)
+#define PATHFINDER_RENDER_MARGIN 40
+#define PATHFINDER_RENDER_BOARD_X PATHFINDER_RENDER_MARGIN
+#define PATHFINDER_RENDER_BOARD_Y 92
+#define PATHFINDER_RENDER_BOARD_SIZE (PATHFINDER_RENDER_TILE * PATHFINDER_COLS) // NOTE(review): also used as the board height, which only matches while ROWS == COLS
+#define PATHFINDER_RENDER_PANEL_WIDTH 332
+#define PATHFINDER_RENDER_WIDTH \
+    (PATHFINDER_RENDER_BOARD_X + PATHFINDER_RENDER_BOARD_SIZE + \
+     PATHFINDER_RENDER_PANEL_WIDTH + PATHFINDER_RENDER_MARGIN)
+#define PATHFINDER_RENDER_HEIGHT \
+    (PATHFINDER_RENDER_BOARD_Y + PATHFINDER_RENDER_BOARD_SIZE + 168)
+
+#define PATHFINDER_ACT_NORTH 0
+#define PATHFINDER_ACT_EAST 1
+#define PATHFINDER_ACT_SOUTH 2
+#define PATHFINDER_ACT_WEST 3
+
+#define PATHFINDER_UNKNOWN -1.0f
+#define PATHFINDER_OPEN 0.0f
+#define PATHFINDER_WALL 1.0f
+
+// Per-episode statistics summed over finished episodes; n counts how many episodes were added.
+typedef struct Log {
+    float perf;
+    float score;
+    float episode_return;
+    float episode_length;
+    float success;
+    float wins;
+    float wall_hits;
+    float revisits;
+    float known_wall_deaths;
+    float repeat_move_deaths;
+    float shortest_path_len;
+    float agent_path_len;
+    float curriculum_level;
+    float curriculum_target_len;
+    float curriculum_next_target_len;
+    float n;
+} Log;
+
+// One episode: agent and goal position, counters, the real maze and what the agent has discovered of it.
+typedef struct State {
+    int tick; // steps taken so far; compared against max_steps in c_step
+    int agent_row;
+    int agent_col;
+    int goal_row;
+    int goal_col;
+    int shortest_path_len;
+    int agent_path_len; // successful moves only
+    int wall_hits; // every bump into a true wall, fatal or not
+    int revisit_count;
+    int known_wall_death;
+    int repeat_move_death;
+    int visited_count;
+    int success;
+    float episode_return;
+    unsigned char visited[PATHFINDER_ROWS][PATHFINDER_COLS];
+    unsigned char recent_rows[3]; // up to three most recent positions, newest last
+    unsigned char recent_cols[3];
+    int recent_count;
+    unsigned char true_walls[PATHFINDER_NUM_WALLS]; // 1 = wall, 0 = open
+    float known_walls[PATHFINDER_NUM_WALLS]; // PATHFINDER_UNKNOWN, PATHFINDER_OPEN or PATHFINDER_WALL
+} State;
+
+// Render-side state.
+typedef struct PathfinderClient {
+    bool show_truth; // draw the real walls and the goal in addition to the agent's knowledge
+} PathfinderClient;
+
+// One environment instance: I/O buffers, reward and maze settings, curriculum progress and the episode state.
+typedef struct Pathfinder {
+    PathfinderClient* client;
+    bool player_mode;
+    Log log;
+    float* observations;
+    float* actions;
+    float* rewards;
+    float* terminals;
+    int num_agents;
+    unsigned int rng; // LCG state advanced by rand_u32
+    float branch_prob;
+    float loop_prob;
+    float step_penalty;
+    float new_wall_penalty;
+    float known_wall_death_penalty;
+    float repeat_move_death_penalty;
+    float new_cell_reward;
+    float revisit_penalty;
+    float impossible_penalty;
+    float goal_reward;
+    int start_solution_len;
+    int curriculum_enabled;
+    int max_steps;
+    int curriculum_level;
+    int curriculum_episodes;
+    State state;
+} Pathfinder;
+
+// Index of the vertical wall on column boundary edge_col (0..COLS) of the given row.
+static inline int v_wall_idx(int row, int edge_col) {
+    return row * (PATHFINDER_COLS + 1) + edge_col;
+}
+
+// Index of the horizontal wall on row boundary edge_row (0..ROWS) of the given column; stored after all vertical walls.
+static inline int h_wall_idx(int edge_row, int col) {
+    return PATHFINDER_VERTICAL_WALLS + edge_row * PATHFINDER_COLS + col;
+}
+
+// True when (row, col) is a cell on the board.
+static inline bool in_bounds(int row, int col) {
+    return row >= 0 && row < PATHFINDER_ROWS && col >= 0 && col < PATHFINDER_COLS;
+}
+
+// Advance the per-env LCG and return 32 bits with the high half folded into the low half.
+static inline unsigned int rand_u32(Pathfinder* env) {
+    env->rng = 1664525u * env->rng + 1013904223u;
+    return env->rng ^ (env->rng >> 16); // raw low LCG bits repeat with period 2^k, which made every "% 2" / "% 4" caller cycle
+}
+
+// Bernoulli trial with probability threshold/256; consumes one byte of *samples, refilling it (4 bytes) when *remaining is 0.
+static inline bool rand_chance_u8(Pathfinder* env, int threshold,
+        unsigned int* samples, int* remaining) {
+    if (threshold <= 0) {
+        return false;
+    }
+    if (threshold >= 256) {
+        return true;
+    }
+    if (*remaining == 0) {
+        *samples = rand_u32(env);
+        *remaining = 4;
+    }
+    unsigned int sample = *samples & 0xffu;
+    *samples >>= 8;
+    *remaining -= 1;
+    return (int)sample < threshold;
+}
+
+// Wall index separating two orthogonally adjacent cells, or -1 when the cells are not adjacent.
+static inline int wall_idx_between(int row, int col, int next_row, int next_col) {
+    if (next_row == row && next_col == col + 1) {
+        return v_wall_idx(row, col + 1);
+    }
+    if (next_row == row && next_col == col - 1) {
+        return v_wall_idx(row, col);
+    }
+    if (next_col == col && next_row == row + 1) {
+        return h_wall_idx(row + 1, col);
+    }
+    if (next_col == col && next_row == row - 1) {
+        return h_wall_idx(row, col);
+    }
+    return -1;
+}
+
+// Remove the true wall between two adjacent cells (no-op when they are not adjacent).
+static inline void open_edge(State* s, int row, int col, int next_row, int next_col) {
+    int wall_idx = wall_idx_between(row, col, next_row, next_col);
+    if (wall_idx >= 0) {
+        s->true_walls[wall_idx] = 0;
+    }
+}
+
+// Flag a cell as visited, counting it only the first time.
+static inline void mark_visited(State* s, int row, int col) {
+    if (s->visited[row][col]) {
+        return;
+    }
+    s->visited[row][col] = 1;
+    s->visited_count++;
+}
+
+// Restart the recent-position history with just the agent's current cell.
+static inline void reset_move_history(State* s) {
+    s->recent_rows[0] = (unsigned char)s->agent_row;
+    s->recent_cols[0] = (unsigned char)s->agent_col;
+    s->recent_count = 1;
+}
+
+// True when the history reads X, Y, X (X = current cell) and the proposed move would enter Y again.
+static inline bool repeats_two_cell_cycle(
+        const State* s, int next_row, int next_col) {
+    return s->recent_count >= 3 &&
+        s->recent_rows[0] == s->agent_row &&
+        s->recent_cols[0] == s->agent_col &&
+        s->recent_rows[1] == next_row &&
+        s->recent_cols[1] == next_col;
+}
+
+// Append the agent's (already updated) position to the 3-slot history, dropping the oldest entry when full.
+static inline void record_successful_move(State* s) {
+    if (s->recent_count < 3) {
+        int idx = s->recent_count++;
+        s->recent_rows[idx] = (unsigned char)s->agent_row;
+        s->recent_cols[idx] = (unsigned char)s->agent_col;
+        return;
+    }
+
+    s->recent_rows[0] = s->recent_rows[1];
+    s->recent_cols[0] = s->recent_cols[1];
+    s->recent_rows[1] = s->recent_rows[2];
+    s->recent_cols[1] = s->recent_cols[2];
+    s->recent_rows[2] = (unsigned char)s->agent_row;
+    s->recent_cols[2] = (unsigned char)s->agent_col;
+}
+
+// Translate an action id into a row/column step; unknown ids get a step that always leaves the board.
+static inline void action_delta(int action, int* d_row, int* d_col) {
+    *d_row = 0;
+    *d_col = 0;
+    if (action == PATHFINDER_ACT_NORTH) {
+        *d_row = -1;
+    } else if (action == PATHFINDER_ACT_EAST) {
+        *d_col = 1;
+    } else if (action == PATHFINDER_ACT_SOUTH) {
+        *d_row = 1;
+    } else if (action == PATHFINDER_ACT_WEST) {
+        *d_col = -1;
+    } else {
+        *d_row = PATHFINDER_ROWS;
+        *d_col = PATHFINDER_COLS; // was PATHFINDER_ROWS; each axis now uses its own extent
+    }
+}
+
+// Breadth-first search over open edges from cell (0, 0) to the goal; returns the step count, or -1 if unreachable.
+static int shortest_path(const State* s) {
+    int dist[PATHFINDER_ROWS][PATHFINDER_COLS];
+    int queue[PATHFINDER_ROWS * PATHFINDER_COLS];
+    memset(dist, -1, sizeof(dist)); // all-ones bytes: every int reads as -1 (= not reached)
+
+    int head = 0;
+    int tail = 0;
+    dist[0][0] = 0;
+    queue[tail++] = 0;
+
+    static const int d_rows[PATHFINDER_NUM_ACTIONS] = {-1, 0, 1, 0};
+    static const int d_cols[PATHFINDER_NUM_ACTIONS] = {0, 1, 0, -1};
+    while (head < tail) {
+        int cell = queue[head++];
+        int row = cell / PATHFINDER_COLS;
+        int col = cell % PATHFINDER_COLS;
+        if (row == s->goal_row && col == s->goal_col) {
+            return dist[row][col];
+        }
+
+        for (int action = 0; action < PATHFINDER_NUM_ACTIONS; action++) {
+            int nr = row + d_rows[action];
+            int nc = col + d_cols[action];
+            if (!in_bounds(nr, nc)) {
+                continue;
+            }
+            int wall_idx = wall_idx_between(row, col, nr, nc);
+            if (wall_idx < 0 || s->true_walls[wall_idx]) {
+                continue;
+            }
+            if (dist[nr][nc] >= 0) {
+                continue;
+            }
+            dist[nr][nc] = dist[row][col] + 1;
+            queue[tail++] = nr * PATHFINDER_COLS + nc; // each cell is enqueued at most once, so the queue cannot overflow
+        }
+    }
+
+    return -1;
+}
+
+// Write the known wall states followed by the agent's normalized column and row; does nothing without a buffer.
+static void update_observations(Pathfinder* env) {
+    if (env->observations == NULL) {
+        return;
+    }
+
+    memcpy(env->observations, env->state.known_walls,
+        sizeof(float) * PATHFINDER_NUM_WALLS);
+    env->observations[PATHFINDER_NUM_WALLS] =
+        (float)env->state.agent_col / (float)(PATHFINDER_COLS - 1);
+    env->observations[PATHFINDER_NUM_WALLS + 1] =
+        (float)env->state.agent_row / (float)(PATHFINDER_ROWS - 1);
+}
+
+// Clamp a solution length to [1, PATHFINDER_MAX_SOLUTION_LEN].
+static inline int clamp_solution_len(int solution_len) {
+    if (solution_len < 1) {
+        return 1;
+    }
+    if (solution_len > PATHFINDER_MAX_SOLUTION_LEN) {
+        return PATHFINDER_MAX_SOLUTION_LEN;
+    }
+    return solution_len;
+}
+
+// Solution length for the next maze: start length plus curriculum level (clamped), or the maximum when the curriculum is off.
+static inline int current_target_solution_len(const Pathfinder* env) {
+    if (!env->curriculum_enabled) {
+        return PATHFINDER_MAX_SOLUTION_LEN;
+    }
+    return clamp_solution_len(env->start_solution_len + env->curriculum_level);
+}
+
+// True while the curriculum is enabled and its target length is still below the maximum.
+static inline bool curriculum_can_advance(const Pathfinder* env) {
+    return env->curriculum_enabled &&
+        current_target_solution_len(env) < PATHFINDER_MAX_SOLUTION_LEN;
+}
+
+// Close every true wall and mark every wall as unknown to the agent.
+static void init_walls(State* s) {
+    memset(s->true_walls, 1, sizeof(s->true_walls));
+    for (int i = 0; i < PATHFINDER_NUM_WALLS; i++) {
+        s->known_walls[i] = PATHFINDER_UNKNOWN;
+    }
+}
+
+// Thin wrapper around rand_u32; the standalone demo uses it to pick a random action.
+static inline unsigned int pathfinder_rand(Pathfinder* env) {
+    return rand_u32(env);
+}
+
+// Depth-first search for a self-avoiding walk of remaining_steps more steps from (row, col); fills path_rows/path_cols on success.
+static bool carve_solution_recursive(
+        Pathfinder* env, int row, int col, int remaining_steps,
+        int step, int visited_count, int* path_rows, int* path_cols,
+        unsigned char visited[PATHFINDER_ROWS][PATHFINDER_COLS]) {
+    if (remaining_steps == 0) {
+        return true;
+    }
+
+    int max_possible_steps = PATHFINDER_MAX_PATH_CELLS - visited_count;
+    if (remaining_steps > max_possible_steps) {
+        return false;
+    }
+
+    int next_rows[PATHFINDER_NUM_ACTIONS];
+    int next_cols[PATHFINDER_NUM_ACTIONS];
+    int option_count = 0;
+
+    static const int d_rows[PATHFINDER_NUM_ACTIONS] = {-1, 0, 1, 0};
+    static const int d_cols[PATHFINDER_NUM_ACTIONS] = {0, 1, 0, -1};
+    for (int action = 0; action < PATHFINDER_NUM_ACTIONS; action++) {
+        int nr = row + d_rows[action];
+        int nc = col + d_cols[action];
+        if (!in_bounds(nr, nc) || visited[nr][nc]) {
+            continue;
+        }
+        next_rows[option_count] = nr;
+        next_cols[option_count] = nc;
+        option_count++;
+    }
+
+    if (option_count == 0) {
+        return false;
+    }
+
+    for (int i = 0; i < option_count - 1; i++) { // Fisher-Yates shuffle of the candidate moves
+        int swap_idx = i + (int)(rand_u32(env) % (unsigned int)(option_count - i));
+        int tmp_row = next_rows[i];
+        int tmp_col = next_cols[i];
+        next_rows[i] = next_rows[swap_idx];
+        next_cols[i] = next_cols[swap_idx];
+        next_rows[swap_idx] = tmp_row;
+        next_cols[swap_idx] = tmp_col;
+    }
+
+    for (int i = 0; i < option_count; i++) {
+        int nr = next_rows[i];
+        int nc = next_cols[i];
+
+        visited[nr][nc] = 1;
+        path_rows[step + 1] = nr;
+        path_cols[step + 1] = nc;
+
+        if (carve_solution_recursive(env, nr, nc, remaining_steps - 1, step + 1, visited_count + 1,
+                path_rows, path_cols, visited)) {
+            return true;
+        }
+        visited[nr][nc] = 0; // dead end: release the cell and try the next candidate
+    }
+    return false;
+}
+
+// Carve a random self-avoiding corridor of target_len steps from (0, 0) and put the goal at its end; false if no walk was found.
+static bool carve_solution(Pathfinder* env, int target_len) {
+    State* s = &env->state;
+    if (target_len <= 0) {
+        target_len = 1;
+    }
+    if (target_len > PATHFINDER_MAX_SOLUTION_LEN) {
+        target_len = PATHFINDER_MAX_SOLUTION_LEN;
+    }
+
+    unsigned char visited[PATHFINDER_ROWS][PATHFINDER_COLS] = {0};
+    int path_rows[PATHFINDER_MAX_PATH_CELLS];
+    int path_cols[PATHFINDER_MAX_PATH_CELLS];
+
+    visited[0][0] = 1;
+    path_rows[0] = 0;
+    path_cols[0] = 0;
+
+    if (!carve_solution_recursive(env, 0, 0, target_len, 0, 1,
+            path_rows, path_cols, visited)) {
+        return false;
+    }
+
+    for (int step = 0; step < target_len; step++) { // walls are only touched once a complete walk exists
+        open_edge(s, path_rows[step], path_cols[step], path_rows[step + 1], path_cols[step + 1]);
+    }
+
+    s->goal_row = path_rows[target_len];
+    s->goal_col = path_cols[target_len];
+    return true;
+}
+
+// Open interior walls at random (probability branch_prob + loop_prob, capped at 0.5), reverting any opening that shortens the solution below target_len.
+static void open_random_edges(Pathfinder* env, int target_len) {
+    State* s = &env->state;
+    float open_prob = env->branch_prob + env->loop_prob;
+    if (open_prob < 0.0f) open_prob = 0.0f;
+    if (open_prob > 0.50f) open_prob = 0.50f;
+    int open_threshold = (int)(open_prob * 256.0f);
+    unsigned int samples = 0;
+    int remaining = 0;
+
+    for (int row = 0; row < PATHFINDER_ROWS; row++) { // interior vertical walls only; the border stays closed
+        for (int col = 0; col < PATHFINDER_COLS - 1; col++) {
+            if (rand_chance_u8(env, open_threshold, &samples, &remaining)) {
+                int wall_idx = v_wall_idx(row, col + 1);
+                if (s->true_walls[wall_idx] == 0) {
+                    continue;
+                }
+                s->true_walls[wall_idx] = 0;
+                if (shortest_path(s) < target_len) {
+                    s->true_walls[wall_idx] = 1;
+                }
+            }
+        }
+    }
+    for (int row = 0; row < PATHFINDER_ROWS - 1; row++) { // interior horizontal walls only
+        for (int col = 0; col < PATHFINDER_COLS; col++) {
+            if (rand_chance_u8(env, open_threshold, &samples, &remaining)) {
+                int wall_idx = h_wall_idx(row + 1, col);
+                if (s->true_walls[wall_idx] == 0) {
+                    continue;
+                }
+                s->true_walls[wall_idx] = 0;
+                if (shortest_path(s) < target_len) {
+                    s->true_walls[wall_idx] = 1;
+                }
+            }
+        }
+    }
+}
+
+// Build a fresh maze for the current curriculum target and record its shortest path length.
+static void generate_maze(Pathfinder* env) {
+    State* s = &env->state;
+    int target_len = current_target_solution_len(env);
+
+    init_walls(s);
+    bool carved = carve_solution(env, target_len);
+    if (!carved) {
+        carved = carve_solution(env, target_len); // one retry with fresh random choices
+    }
+
+    if (!carved) {
+        target_len = 1; // last resort: goal right next to the start
+        carve_solution(env, target_len);
+    }
+
+    open_random_edges(env, target_len);
+    s->shortest_path_len = shortest_path(s);
+    if (s->shortest_path_len < 0) {
+        s->shortest_path_len = target_len;
+    }
+}
+
+// Count the finished episode and raise the curriculum level after a success while it can still advance.
+static void update_curriculum(Pathfinder* env, int success) {
+    env->curriculum_episodes++;
+    if (!success || !curriculum_can_advance(env)) {
+        return;
+    }
+
+    env->curriculum_level++;
+}
+
+// Accumulate the finished episode's statistics into env->log and update the curriculum.
+void add_log(Pathfinder* env) {
+    State* s = &env->state;
+    float success = (float)s->success;
+    int current_curriculum_level = env->curriculum_level; // captured before update_curriculum() may raise it
+    int current_target_len = current_target_solution_len(env);
+    int next_target_len = current_target_len;
+    if (s->success && curriculum_can_advance(env)) {
+        next_target_len = clamp_solution_len(
+            env->start_solution_len + env->curriculum_level + 1);
+    }
+    float efficiency = 0.0f;
+    if (s->success && s->agent_path_len > 0 && s->shortest_path_len > 0) {
+        efficiency = (float)s->shortest_path_len / (float)s->agent_path_len;
+        if (efficiency > 1.0f) {
+            efficiency = 1.0f;
+        }
+    }
+
+    update_curriculum(env, s->success);
+
+    env->log.perf += success;
+    env->log.score += success * efficiency;
+    env->log.episode_return += s->episode_return;
+    env->log.episode_length += (float)s->tick;
+    env->log.success += success;
+    env->log.wins += success;
+    env->log.wall_hits += (float)s->wall_hits;
+    env->log.revisits += (float)s->revisit_count;
+    env->log.known_wall_deaths += (float)s->known_wall_death;
+    env->log.repeat_move_deaths += (float)s->repeat_move_death;
+    env->log.shortest_path_len += (float)s->shortest_path_len;
+    env->log.agent_path_len += (float)s->agent_path_len;
+    env->log.curriculum_level += (float)current_curriculum_level;
+    env->log.curriculum_target_len += (float)current_target_len;
+    env->log.curriculum_next_target_len += (float)next_target_len;
+    env->log.n += 1.0f;
+}
+
+// Rewrite the observation buffer from the current state.
+void puffer_state_refresh(Pathfinder* env) {
+    update_observations(env);
+}
+
+// Finish environment setup: default num_agents to 1 when it was left at 0.
+void init(Pathfinder* env) {
+    if (env->num_agents == 0) {
+        env->num_agents = 1;
+    }
+}
+
+// Start a new episode on a newly generated maze with the agent at (0, 0).
+void c_reset(Pathfinder* env) {
+    State* s = &env->state;
+    memset(s, 0, sizeof(*s));
+    s->agent_row = 0;
+    s->agent_col = 0;
+    generate_maze(env);
+    mark_visited(s, s->agent_row, s->agent_col);
+    reset_move_history(s);
+    update_observations(env);
+}
+
+// Restart the episode on the same maze: counters and wall knowledge are cleared, true walls and goal are kept.
+static void reset_attempt(Pathfinder* env) {
+    State* s = &env->state;
+    s->tick = 0;
+    s->agent_row = 0;
+    s->agent_col = 0;
+    s->agent_path_len = 0;
+    s->wall_hits = 0;
+    s->revisit_count = 0;
+    s->known_wall_death = 0;
+    s->repeat_move_death = 0;
+    s->visited_count = 0;
+    s->success = 0;
+    s->episode_return = 0.0f;
+    memset(s->visited, 0, sizeof(s->visited));
+    for (int i = 0; i < PATHFINDER_NUM_WALLS; i++) {
+        s->known_walls[i] = PATHFINDER_UNKNOWN;
+    }
+    mark_visited(s, s->agent_row, s->agent_col);
+    reset_move_history(s);
+    update_observations(env);
+}
+
+// Record the true state of a wall in the agent's knowledge the first time it is probed.
+static void reveal_wall(Pathfinder* env, int wall_idx) {
+    State* s = &env->state;
+    if (s->known_walls[wall_idx] != PATHFINDER_UNKNOWN) {
+        return;
+    }
+    s->known_walls[wall_idx] = s->true_walls[wall_idx] ? PATHFINDER_WALL : PATHFINDER_OPEN;
+}
+
+// Apply actions[0], write rewards[0]/terminals[0], and on episode end log it and reset (new maze after a win, same maze otherwise).
+void c_step(Pathfinder* env) {
+    State* s = &env->state;
+    env->terminals[0] = 0.0f;
+    env->rewards[0] = 0.0f;
+    s->tick++;
+
+    float reward = env->step_penalty;
+    float raw_action = env->actions[0];
+    // NaN and out-of-range values become -1 (an invalid action) instead of an undefined float-to-int cast.
+    int action = (raw_action > -1.0f && raw_action < (float)PATHFINDER_NUM_ACTIONS)
+        ? (int)raw_action : -1;
+    int d_row;
+    int d_col;
+    action_delta(action, &d_row, &d_col);
+    int next_row = s->agent_row + d_row;
+    int next_col = s->agent_col + d_col;
+    int wall_idx = wall_idx_between(s->agent_row, s->agent_col, next_row, next_col);
+    if (!in_bounds(next_row, next_col)) {
+        reward += env->impossible_penalty;
+        env->terminals[0] = 1.0f;
+    } else {
+        bool was_known = s->known_walls[wall_idx] != PATHFINDER_UNKNOWN;
+        reveal_wall(env, wall_idx);
+
+        if (s->true_walls[wall_idx]) {
+            s->wall_hits++;
+            reward += was_known ? env->known_wall_death_penalty : env->new_wall_penalty;
+            if (was_known) { // only a wall the agent had already discovered is fatal
+                s->known_wall_death = 1;
+                env->terminals[0] = 1.0f;
+            }
+        } else {
+            if (repeats_two_cell_cycle(s, next_row, next_col)) {
+                reward += env->repeat_move_death_penalty;
+                s->repeat_move_death = 1;
+                env->terminals[0] = 1.0f;
+            } else {
+                bool revisited = s->visited[next_row][next_col] != 0;
+                s->agent_row = next_row;
+                s->agent_col = next_col;
+                s->agent_path_len++;
+                record_successful_move(s);
+                if (revisited) {
+                    s->revisit_count++;
+                    reward += env->revisit_penalty;
+                } else {
+                    mark_visited(s, next_row, next_col);
+                    reward += env->new_cell_reward;
+                }
+                if (s->agent_row == s->goal_row && s->agent_col == s->goal_col) {
+                    s->success = 1;
+                    reward += env->goal_reward;
+                    env->terminals[0] = 1.0f;
+                }
+            }
+        }
+    }
+
+    if (s->tick >= env->max_steps && env->terminals[0] == 0.0f) {
+        env->terminals[0] = 1.0f;
+    }
+
+    env->rewards[0] = reward;
+    s->episode_return += reward;
+    update_observations(env);
+
+    if (env->terminals[0]) {
+        int solved = s->success; // read before the reset below clears it
+        add_log(env);
+        if (solved) {
+            c_reset(env);
+        } else {
+            reset_attempt(env);
+        }
+    }
+}
+
+// Close the render window if one is open and release the client.
+void c_close(Pathfinder* env) {
+#if !defined(PATHFINDER_NO_RENDER) && !defined(PUFFER_PERF_NO_RENDER)
+    if (IsWindowReady()) {
+        CloseWindow();
+    }
+#endif
+    free(env->client);
+    env->client = NULL;
+}
+
+#if defined(PATHFINDER_NO_RENDER) || defined(PUFFER_PERF_NO_RENDER)
+// Rendering is compiled out: c_render is a no-op.
+void c_render(Pathfinder* env) {
+    (void)env;
+}
+#else
+static const Color PATHFINDER_BG = {6, 24, 24, 255};
+static const Color PATHFINDER_CELL_A = {16, 39, 42, 255};
+static const Color PATHFINDER_CELL_B = {19, 46, 49, 255};
+static const Color PATHFINDER_GRID = {54, 84, 86, 255};
+static const Color PATHFINDER_TEXT = {235, 242, 240, 255};
+static const Color PATHFINDER_MUTED = {145, 166, 164, 255};
+static const Color PATHFINDER_TRUE_WALL = {88, 96, 99, 255};
+static const Color PATHFINDER_UNKNOWN_EDGE = {42, 63, 65, 255};
+static const Color PATHFINDER_KNOWN_WALL = {218, 59, 54, 255};
+static const Color PATHFINDER_KNOWN_OPEN = {75, 196, 118, 255};
+static const Color PATHFINDER_AGENT = {0, 187, 187, 255};
+static const Color PATHFINDER_GOAL = {232, 184, 58, 255};
+static const Color PATHFINDER_START = {118, 146, 150, 255};
+static const Color PATHFINDER_VISITED = {0, 187, 187, 42};
+
+// Allocate the render client and open the window; returns NULL (without opening a window) when allocation fails.
+static PathfinderClient* make_client(Pathfinder* env) {
+    PathfinderClient* client = (PathfinderClient*)calloc(1, sizeof(PathfinderClient));
+    if (client == NULL) {
+        return NULL;
+    }
+    client->show_truth = !env->player_mode;
+    InitWindow(PATHFINDER_RENDER_WIDTH, PATHFINDER_RENDER_HEIGHT, "PufferLib Pathfinder");
+    SetTargetFPS(30);
+    return client;
+}
+
+// Left pixel coordinate of a board column.
+static inline int cell_x(int col) {
+    return PATHFINDER_RENDER_BOARD_X + col * PATHFINDER_RENDER_TILE;
+}
+
+// Top pixel coordinate of a board row.
+static inline int cell_y(int row) {
+    return PATHFINDER_RENDER_BOARD_Y + row * PATHFINDER_RENDER_TILE;
+}
+
+// Pixel centre of a cell.
+static inline Vector2 cell_center(int row, int col) {
+    return (Vector2){
+        (float)(cell_x(col) + PATHFINDER_RENDER_TILE / 2),
+        (float)(cell_y(row) + PATHFINDER_RENDER_TILE / 2)
+    };
+}
+
+// Draw text horizontally centred on cx with its top at y.
+static void draw_centered_text(const char* text, int cx, int y,
+        int font_size, Color color) {
+    int width = MeasureText(text, font_size);
+    DrawText(text, cx - width / 2, y, font_size, color);
+}
+
+// Draw one wall edge: a thin base line, the true wall when truth is shown, then the agent's knowledge (wall or open) on top.
+static void draw_edge(Pathfinder* env, int wall_idx, Vector2 start, Vector2 end) {
+    State* s = &env->state;
+    float known = s->known_walls[wall_idx];
+
+    DrawLineEx(start, end, 2.0f, PATHFINDER_UNKNOWN_EDGE);
+    if (env->client->show_truth && s->true_walls[wall_idx]) {
+        DrawLineEx(start, end, 6.0f, PATHFINDER_TRUE_WALL);
+    }
+
+    if (known == PATHFINDER_WALL) {
+        DrawLineEx(start, end, 8.0f, PATHFINDER_KNOWN_WALL);
+    } else if (known == PATHFINDER_OPEN) {
+        DrawLineEx(start, end, 4.0f, PATHFINDER_KNOWN_OPEN);
+    }
+}
+
+// Draw cells, axis labels, the start marker, every wall edge, the goal (truth view only) and the agent.
+static void draw_board(Pathfinder* env) {
+    State* s = &env->state;
+
+    for (int row = 0; row < PATHFINDER_ROWS; row++) {
+        for (int col = 0; col < PATHFINDER_COLS; col++) {
+            Color cell_color = ((row + col) & 1) ? PATHFINDER_CELL_A : PATHFINDER_CELL_B;
+            DrawRectangle(cell_x(col), cell_y(row),
+                PATHFINDER_RENDER_TILE - 1, PATHFINDER_RENDER_TILE - 1, cell_color);
+            if (s->visited[row][col]) {
+                DrawRectangle(cell_x(col) + 8, cell_y(row) + 8,
+                    PATHFINDER_RENDER_TILE - 17, PATHFINDER_RENDER_TILE - 17, PATHFINDER_VISITED);
+            }
+        }
+    }
+
+    DrawRectangleLinesEx((Rectangle){
+        (float)PATHFINDER_RENDER_BOARD_X,
+        (float)PATHFINDER_RENDER_BOARD_Y,
+        (float)PATHFINDER_RENDER_BOARD_SIZE,
+        (float)PATHFINDER_RENDER_BOARD_SIZE
+    }, 2.0f, PATHFINDER_GRID);
+
+    for (int col = 0; col < PATHFINDER_COLS; col++) {
+        char label[2] = {(char)('A' + col), '\0'};
+        draw_centered_text(label,
+            cell_x(col) + PATHFINDER_RENDER_TILE / 2,
+            PATHFINDER_RENDER_BOARD_Y - 28, 20, PATHFINDER_TEXT);
+    }
+    for (int row = 0; row < PATHFINDER_ROWS; row++) {
+        DrawText(TextFormat("%i", row + 1),
+            PATHFINDER_RENDER_BOARD_X - 28,
+            cell_y(row) + PATHFINDER_RENDER_TILE / 2 - 10,
+            20, PATHFINDER_TEXT);
+    }
+
+    DrawRectangleLinesEx((Rectangle){
+        (float)cell_x(0) + 4.0f,
+        (float)cell_y(0) + 4.0f,
+        (float)PATHFINDER_RENDER_TILE - 9.0f,
+        (float)PATHFINDER_RENDER_TILE - 9.0f
+    }, 2.0f, PATHFINDER_START);
+    draw_centered_text("A1", cell_x(0) + PATHFINDER_RENDER_TILE / 2,
+        cell_y(0) + PATHFINDER_RENDER_TILE - 24, 16, PATHFINDER_MUTED);
+
+    for (int row = 0; row < PATHFINDER_ROWS; row++) {
+        for (int edge_col = 0; edge_col <= PATHFINDER_COLS; edge_col++) {
+            int wall_idx = v_wall_idx(row, edge_col);
+            float x = (float)(PATHFINDER_RENDER_BOARD_X + edge_col * PATHFINDER_RENDER_TILE);
+            float y0 = (float)(cell_y(row) + 7);
+            float y1 = (float)(cell_y(row + 1) - 7);
+            draw_edge(env, wall_idx, (Vector2){x, y0}, (Vector2){x, y1});
+        }
+    }
+    for (int edge_row = 0; edge_row <= PATHFINDER_ROWS; edge_row++) {
+        for (int col = 0; col < PATHFINDER_COLS; col++) {
+            int wall_idx = h_wall_idx(edge_row, col);
+            float x0 = (float)(cell_x(col) + 7);
+            float x1 = (float)(cell_x(col + 1) - 7);
+            float y = (float)(PATHFINDER_RENDER_BOARD_Y + edge_row * PATHFINDER_RENDER_TILE);
+            draw_edge(env, wall_idx, (Vector2){x0, y}, (Vector2){x1, y});
+        }
+    }
+
+    if (env->client->show_truth) {
+        Vector2 goal = cell_center(s->goal_row, s->goal_col);
+        DrawCircleV(goal, 19.0f, PATHFINDER_GOAL);
+        draw_centered_text("T", (int)goal.x, (int)goal.y - 10, 22, PATHFINDER_BG);
+    }
+
+    Vector2 agent = cell_center(s->agent_row, s->agent_col);
+    DrawCircleV(agent, 21.0f, PATHFINDER_AGENT);
+    DrawCircleLines((int)agent.x, (int)agent.y, 22.0f, PATHFINDER_TEXT);
+    draw_centered_text("P", (int)agent.x, (int)agent.y - 11, 24, PATHFINDER_BG);
+}
+
+// Draw the side panel (view mode, position, target, counters) and the key help line.
+static void draw_panel(Pathfinder* env) {
+    State* s = &env->state;
+    int x = PATHFINDER_RENDER_BOARD_X + PATHFINDER_RENDER_BOARD_SIZE + 34;
+    int y = PATHFINDER_RENDER_BOARD_Y;
+    DrawText("Pathfinder", x, y, 28, PATHFINDER_TEXT);
+    y += 38;
+    DrawText(env->client->show_truth ? "View: truth + observation" : "View: observation only",
+        x, y, 18, env->client->show_truth ? PATHFINDER_GOAL : PATHFINDER_KNOWN_OPEN);
+    y += 34;
+
+    DrawText(TextFormat("Position: %c%i", 'A' + s->agent_col, s->agent_row + 1),
+        x, y, 20, PATHFINDER_TEXT);
+    y += 26;
+    if (env->client->show_truth) {
+        DrawText(TextFormat("Target: %c%i", 'A' + s->goal_col, s->goal_row + 1),
+            x, y, 20, PATHFINDER_GOAL);
+    } else {
+        DrawText("Target: hidden", x, y, 20, PATHFINDER_MUTED);
+    }
+    y += 34;
+
+    y += 10;
+
+    DrawText(TextFormat("Wall hits: %i", s->wall_hits), x, y, 18, PATHFINDER_TEXT); // was "Wall deaths", but wall_hits also counts non-fatal first bumps
+    y += 24;
+    DrawText(TextFormat("Known-wall deaths: %.0f", env->log.known_wall_deaths),
+        x, y, 18, PATHFINDER_KNOWN_WALL);
+    y += 24;
+    DrawText(TextFormat("Repeat-move deaths: %.0f", env->log.repeat_move_deaths),
+        x, y, 18, PATHFINDER_KNOWN_WALL);
+    y += 24;
+    DrawText(TextFormat("Wins: %.0f", env->log.wins),
+        x, y, 18, PATHFINDER_KNOWN_OPEN);
+
+    DrawText("Arrows/WASD move | R reset", PATHFINDER_RENDER_BOARD_X,
+        PATHFINDER_RENDER_HEIGHT - 30, 18, PATHFINDER_MUTED);
+    DrawText(env->player_mode
+        ? "SPACE random | TAB locked off | ESC quit"
+        : "TAB view | SPACE random | ESC quit",
+        PATHFINDER_RENDER_BOARD_X + 310, PATHFINDER_RENDER_HEIGHT - 30,
+        18, PATHFINDER_MUTED);
+}
+
+// Render one frame, creating the window/client on first use and handling ESC (quit) and TAB (toggle truth view).
+void c_render(Pathfinder* env) {
+    if (!IsWindowReady()) {
+        free(env->client); // a client may survive from an earlier window; release it instead of leaking it
+        env->client = make_client(env);
+    } else if (env->client == NULL) {
+        env->client = (PathfinderClient*)calloc(1, sizeof(PathfinderClient));
+        if (env->client != NULL) {
+            env->client->show_truth = !env->player_mode;
+        }
+    }
+    if (env->client == NULL) {
+        return; // allocation failed: skip this frame rather than dereference NULL
+    }
+    if (env->player_mode) {
+        env->client->show_truth = false;
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) {
+        c_close(env);
+        exit(0);
+    }
+    if (IsKeyPressed(KEY_TAB) && !env->player_mode) {
+        env->client->show_truth = !env->client->show_truth;
+    }
+
+    BeginDrawing();
+    ClearBackground(PATHFINDER_BG);
+    DrawText("Milton Bradley Pathfinder", PATHFINDER_RENDER_BOARD_X, 26, 30, PATHFINDER_TEXT);
+    DrawText(env->player_mode
+        ? "Red = known wall, green = known open"
+        : "Red = known wall, green = known open, gray = true hidden wall",
+        PATHFINDER_RENDER_BOARD_X, 60, 18, PATHFINDER_MUTED);
+    draw_board(env);
+    draw_panel(env);
+    EndDrawing();
+}
+#endif