diff --git a/BAT9_SWEEP_ANALYSIS.md b/BAT9_SWEEP_ANALYSIS.md new file mode 100644 index 0000000000..54d848c97d --- /dev/null +++ b/BAT9_SWEEP_ANALYSIS.md @@ -0,0 +1,450 @@ +# Bat9 Sweep Analysis + +Date: 2026-06-11 + +This note summarizes the local W&B `bat9` sweep after the timer observation, +sweepable ear directivity, and bug wing sideband changes. It is modeled after +`BAT8_SWEEP_ANALYSIS.md` and should be used before copying a Bat9 candidate into +`config/bat.ini`. + +## Future Agent Workflow + +Use this workflow when trying a Bat9 candidate. + +1. Pick by behavior objective, not only by `env/perf`. + - Start with `ifbn2epd` for the first balanced retrain/video candidate. + - Keep `ewgh6l5l` as the best composite scalar anchor. + - Use `qkwrqhzy` when SPS matters, `sfmk59n1` or `w938us46` for high + curriculum checks, and `cpx4gj2f` for a 128x5 balanced comparison. + - Treat `1a2s8uvf` as a low-chirp experiment only; its timeout rate is high. + +2. Pull exact hyperparameters from the local W&B config. + - Use `wandb/run-*-<run_hash>/files/config.yaml` or `logs/bat/<run_hash>.json`. + - Copy only concrete values from `vec`, `policy`, `env`, and `train`. + - Do not copy sweep search-space sections. + - Keep the run's configured `train.total_timesteps`. + - Local JSON/YAML files do not always store W&B display names. When the name + matters, query W&B with the run hash, for example: + `wandb.Api().run("kinvert-k/bat9/<run_hash>").name`. + +3. Before each candidate train/eval cycle, run: + + ```bash + source .venv/bin/activate && ./build.sh bat && bash ocean/bat/tests/run_all.sh + ``` + +4. Train with the selected config and no timestep override: + + ```bash + source .venv/bin/activate && python -m pufferlib.pufferl train bat --train.gpus 1 + ``` + + If CUDA is hidden inside Codex, rerun the same command outside the sandbox or + with escalated permissions. Do not switch Bat to CPU. + +5. 
Run fixed-level visual evals before adopting defaults: + + ```bash + timeout 45s bash -lc 'source .venv/bin/activate && DISPLAY=:0 python -m pufferlib.pufferl eval bat --load-model-path latest --env.curriculum-initial-level 5 --env.curriculum-successes-per-level 1000000' + timeout 45s bash -lc 'source .venv/bin/activate && DISPLAY=:0 python -m pufferlib.pufferl eval bat --load-model-path latest --env.curriculum-initial-level 10 --env.curriculum-successes-per-level 1000000' + ``` + +6. Record the first postable MP4 only after a retrained checkpoint looks clean: + + ```bash + timeout 45s bash -lc 'source .venv/bin/activate && DISPLAY=:0 python -m pufferlib.pufferl eval bat --load-model-path latest --env.curriculum-initial-level 5 --env.curriculum-successes-per-level 1000000 --env.record-video 1 --env.record-video-fps 30 --env.record-video-seconds 30 --env.record-video-audio 1' + ``` + + Expected output is `recordings/bat_recording.mp4`. Do not commit recordings, + gifs, local W&B folders, logs, or checkpoint artifacts unless asked. + +## Source And Filter + +Source data is the local `wandb/` tree in `/home/claude/pathfinder`, filtered to +runs where the W&B metadata/config has `--wandb-project bat9` or +`wandb_project = bat9`. + +- Sweep invocation in run metadata: + `python -m pufferlib.pufferl ... --sweep.gpus 1 --train.gpus 1 --sweep.use-gpu "" --sweep.max-runs 1000 --wandb --wandb-project bat9` +- Git commit in run metadata: `ac61d3bfebb5c24c6a0703c3998940904df0a140` +- Hardware in run metadata: `G240`, `NVIDIA GeForce RTX 5060` +- Bat9 rows with `env/perf` and usable W&B config: `789` +- Rows with `env/perf >= 0.25`: `507` +- Pareto front rows over the selected objectives: `123` + +The previous handoff mentioned `773` complete runs. The local tree had grown by +the time this snapshot was frozen; the top-six ordering remained unchanged. 
+ +`env/perf` is the composite sweep objective: + +```text +perf = base_perf * curriculum_difficulty * chirp_perf +``` + +This is still not the same as "best visible behavior." High scalar scores can +come from low chirp counts, curriculum progress, or catch rate in different +proportions. + +## Bat9 Code Changes + +Bat9 differs from Bat8 in three behavior-relevant ways: + +- Timer observation: `BAT_OBS_SIZE` is now `41`, and observation slot `40` + receives normalized elapsed episode time. This should give the policy urgency + information that Bat8 lacked. +- Ear directivity: `ear_rear_gain`, `ear_front_gain`, and `ear_side_gain` are + sweepable, and the echo scheduler mixes rear baseline, forward response, and + left/right side response into per-ear intensity. +- Bug wing sidebands: bug echoes add adjacent frequency-bin sideband energy + scaled by `bug_wing_sideband_gain`. + +Bat9 also logs `env/mean_chirp_bandwidth`, which helps distinguish detection +benefits from policies that merely chirp broadly or noisily. 
+ +## Overall Distribution + +Across all 789 complete Bat9 rows: + +| `env/perf` quantile | value | +| ---: | ---: | +| min | `0.0000` | +| 25% | `0.2023` | +| 50% | `0.2996` | +| 75% | `0.3628` | +| 90% | `0.4014` | +| 95% | `0.4274` | +| 99% | `0.4861` | +| max | `0.5565` | + +Filtered high-perf rows (`env/perf >= 0.25`) look like this: + +| metric | mean | median | q25 | q75 | min | max | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `env/perf` | `0.3477` | `0.3459` | `0.3058` | `0.3843` | `0.2505` | `0.5565` | +| `env/base_perf` | `0.9173` | `0.9237` | `0.9028` | `0.9427` | `0.5768` | `0.9741` | +| `env/curriculum_perf` | `0.8004` | `0.8161` | `0.7841` | `0.8350` | `0.4398` | `0.9327` | +| `env/chirp_perf` | `0.4452` | `0.4417` | `0.3936` | `0.4877` | `0.3011` | `0.6888` | +| `env/curriculum_level` | `11.94` | `11.99` | `11.12` | `12.83` | `8.05` | `15.62` | +| `env/chirps_emitted` | `8.33` | `8.38` | `7.69` | `9.11` | `4.67` | `10.50` | +| `env/chirp_overlap_fraction` | `0.1249` | `0.1069` | `0.0510` | `0.1742` | `0.0001` | `0.5243` | +| `env/mean_chirp_bandwidth` | `0.4071` | `0.3750` | `0.3356` | `0.4556` | `0.0010` | `0.9753` | +| `env/timeout` | `0.0023` | `0.0011` | `0.0004` | `0.0025` | `0.0000` | `0.0521` | +| `env/collision` | `0.0804` | `0.0737` | `0.0549` | `0.0962` | `0.0182` | `0.4232` | +| `bad_terminal` | `0.0827` | `0.0763` | `0.0573` | `0.0972` | `0.0259` | `0.4232` | +| `env/episode_length` | `251.18` | `249.34` | `224.93` | `272.49` | `153.72` | `420.04` | +| `SPS` | `1.09M` | `0.99M` | `0.44M` | `1.63M` | `0.40M` | `2.45M` | + +## Bat8 To Bat9 Read + +This is a qualitative before/after, not a matched statistical test. The Bat8 +numbers are from `BAT8_SWEEP_ANALYSIS.md` plus the same local W&B scan for +episode length. 
+ +| high-perf metric | Bat8 mean | Bat9 mean | read | +| --- | ---: | ---: | --- | +| `env/perf` | `0.3250` | `0.3477` | Bat9 shifted the upper half upward, though Bat8's single best scalar was slightly higher (`0.5695` vs `0.5565`). | +| `env/base_perf` | `0.8592` | `0.9173` | Clear catch-rate improvement. | +| `env/curriculum_perf` | `0.7583` | `0.8004` | Bat9 reaches harder behavior more consistently. | +| `env/chirp_perf` | `0.4591` | `0.4452` | Slightly worse chirp efficiency; Bat9 spends a bit more chirp budget. | +| `env/curriculum_level` | `11.10` | `11.94` | Curriculum level improved by about `0.85`. | +| `env/chirps_emitted` | `8.12` | `8.33` | Small increase in chirp usage. | +| `env/chirp_overlap_fraction` | `0.1580` | `0.1249` | Overlap improved. | +| `env/timeout` | `0.0020` | `0.0023` | No aggregate timeout win from the timer observation. | +| `env/collision` | `0.1388` | `0.0804` | Large collision reduction. | +| `env/episode_length` | `214.93` | `251.18` | Episodes got longer, so the timer did not simply make policies rush. | +| `SPS` | `1.63M` | `1.09M` | Bat9 is slower, mostly because many good runs are wider models. | + +Timer read: the timer observation did not remove timeout/circling risk by +itself. In the top six non-silent runs, `1a2s8uvf` still times out at `0.0512`, `qkwrqhzy` at +`0.0216`, and `ewgh6l5l`/`gli5dke9` have long episodes. Fixed-level visual eval +is still required before declaring a default. 
+ +## Top Composite Runs + +| run id | role | perf | base | curriculum | chirp perf | level | chirps | timeout | collision | episode len | SPS | model | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| `rdjj5r21` | silent inbound exploit | `0.8438` | `0.9638` | `0.8450` | `0.9985` | `10.97` | `0.02` | `0.0282` | `0.0079` | `468.48` | `2.62M` | 64x5 | +| `ewgh6l5l` | best composite | `0.5565` | `0.9497` | `0.8392` | `0.6544` | `11.30` | `5.19` | `0.0088` | `0.0414` | `378.95` | `1.13M` | 256x5 | +| `ifbn2epd` | first retrain/video pick | `0.5398` | `0.9639` | `0.8524` | `0.6372` | `11.11` | `5.44` | `0.0051` | `0.0310` | `303.99` | `2.15M` | 64x5 | +| `gli5dke9` | high scalar, slow wide model | `0.5371` | `0.9459` | `0.8363` | `0.6448` | `11.54` | `5.33` | `0.0139` | `0.0402` | `420.04` | `0.46M` | 512x5 | +| `sfmk59n1` | high curriculum top-six | `0.5314` | `0.9285` | `0.9213` | `0.5840` | `13.51` | `6.24` | `0.0014` | `0.0701` | `263.42` | `0.98M` | 256x5 | +| `1a2s8uvf` | lowest chirp top-six, timeout risk | `0.5243` | `0.9048` | `0.7648` | `0.6888` | `10.47` | `4.67` | `0.0512` | `0.0441` | `362.04` | `1.07M` | 256x5 | +| `qkwrqhzy` | fastest top-six | `0.5145` | `0.9227` | `0.8075` | `0.6389` | `11.05` | `5.42` | `0.0216` | `0.0557` | `330.63` | `2.45M` | 64x4 | +| `63dl6lpc` | low-chirp comparison | `0.4975` | `0.9138` | `0.7219` | `0.6864` | `10.03` | `4.70` | `0.0447` | `0.0415` | `379.87` | `1.32M` | 256x4 | +| `cpx4gj2f` | balanced 128x5 candidate | `0.4968` | `0.9522` | `0.8269` | `0.6070` | `10.96` | `5.89` | `0.0119` | `0.0360` | `309.02` | `1.72M` | 128x5 | + +## Pursuit-Biased Short-Episode Candidates + +This pass looks for high `env/perf` with lower `env/episode_length`, under the +working hypothesis that shorter successful episodes are more likely to be active +pursuit than waiting/intercept behavior. 
This is only a proxy: very short +episodes can also mean fast collisions, so the best candidates below keep +`base_perf` high and avoid large timeout/collision rates. + +Across the current local Bat9 logs, `env/perf >= 0.40` has median episode length +`248.23` and q25 `223.10`. The rows below are the most promising pursuit-biased +visual candidates, with W&B names resolved from `kinvert-k/bat9` on 2026-06-11. + +| hash | W&B name | why inspect | perf | episode len | base | level | chirps | timeout | collision | SPS | model | +| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| `o7yrj371` | `vivid-breeze-268` | Best first pursuit candidate: high perf, short episode, low timeout/collision, and fast 64x5 model. | `0.4749` | `221.67` | `0.9323` | `13.21` | `6.29` | `0.0008` | `0.0670` | `1.99M` | 64x5 | +| `wxyb10fq` | `happy-shadow-619` | Highest short-episode candidate under ~230 steps; high level and good SPS. | `0.4846` | `225.07` | `0.9237` | `13.99` | `6.12` | `0.0003` | `0.0760` | `1.70M` | 128x5 | +| `rm3a29ie` | `generous-violet-224` | Highest `perf / episode_length` among the near-0.485 perf group; moderate collision risk. | `0.4847` | `226.97` | `0.9144` | `12.91` | `5.90` | `0.0014` | `0.0842` | `1.00M` | 256x5 | +| `x1ayhg3j` | `clean-pyramid-454` | Strong pursuit-ratio candidate: ~199-step episodes with good base and low timeout. | `0.4458` | `199.11` | `0.9234` | `12.90` | `6.72` | `0.0005` | `0.0761` | `0.99M` | 256x5 | +| `zfxopb9j` | `vocal-snowflake-675` | Similar to `x1ayhg3j`, slightly higher perf and slightly longer episode; collision is higher but not extreme. | `0.4535` | `203.92` | `0.9091` | `13.39` | `6.48` | `0.0013` | `0.0896` | `0.92M` | 256x5 | +| `e4ut00v8` | `gentle-wind-710` | Cleaner terminal profile: zero timeout, low collision, high base, short-ish episode. 
| `0.4365` | `218.73` | `0.9472` | `13.78` | `7.14` | `0.0000` | `0.0528` | `1.66M` | 128x5 | +| `op9q6evk` | `sparkling-plasma-859` | Low collision and high base; slower 512x5 model, but a useful clean-pursuit comparison. | `0.4301` | `215.24` | `0.9408` | `13.60` | `7.21` | `0.0005` | `0.0586` | `0.44M` | 512x5 | +| `vt8s8kok` | `golden-moon-129` | Good 128x5 speed/behavior balance with sub-200 episode length and low timeout. | `0.4274` | `198.50` | `0.9297` | `13.58` | `7.14` | `0.0002` | `0.0701` | `1.79M` | 128x5 | +| `tuz0bo8d` | `youthful-lake-438` | Short, low-timeout 128x5 backup; lower perf than the rows above. | `0.4211` | `198.97` | `0.9330` | `13.59` | `7.33` | `0.0001` | `0.0669` | `1.46M` | 128x5 | + +Risky short-episode rows to treat with caution: + +| hash | W&B name | caution | perf | episode len | base | level | chirps | timeout | collision | model | +| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| `d46xhryw` | `hopeful-universe-874` | Very short and high level, but lower base and high collision; may be fast contact/failure rather than clean pursuit. | `0.4608` | `178.85` | `0.8634` | `15.30` | `6.66` | `0.0000` | `0.1366` | 512x5 | +| `l2sg0cpf` | `scarlet-butterfly-228` | Shortest high-perf episode length, but collision is too high for first visual pass. | `0.4573` | `153.72` | `0.8273` | `15.08` | `6.14` | `0.0001` | `0.1726` | 256x5 | + +## Distance-Tempo Chirp Candidates + +Bat logs `env/far_chirp_rate`, `env/near_chirp_rate`, and +`env/chirp_tempo_ratio`. These are distance-region metrics, not strict +episode-time buckets: "far" means the bat-bug distance is greater than `0.66` +of the start distance, and "near" means less than `0.33`. The tempo ratio is +`near_chirp_rate / far_chirp_rate`, capped at `10`. + +This shortlist looks for runs that may chirp sparsely while far from the bug +and chirp faster once close. 
The strict filter was `perf >= 0.40`, +`base_perf >= 0.90`, `timeout <= 0.01`, and `collision <= 0.10`; rows were then +ranked by high tempo ratio, low far rate, enough near rate, later mean chirp +time, lower chirp count, and scalar perf. This pass used `logs/bat/*.json`, +which currently has `907` Bat9 rows with tempo metrics. Display names were +resolved from W&B (`kinvert-k/bat9`) on 2026-06-11. + +| hash | W&B name | why inspect | perf | episode len | chirps | far rate | near rate | tempo ratio | mean chirp tick | timeout | collision | SPS | model | +| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| `x1ayhg3j` | `clean-pyramid-454` | Best first practical eval: short episodes, good perf/base, high near rate, and strong `1.74x` close/far tempo. Level 10 visual review showed a one/few-chirp blind-intercept tactic, not simply continuous close-range chirping. | `0.4458` | `199.11` | `6.72` | `0.0291` | `0.0480` | `1.74` | `0.210` | `0.0005` | `0.0761` | `0.99M` | 256x5 | +| `agdoug04` | `unique-dawn-764` | Best composite sparse-far/fast-near row; far chirp rate is very low, but episodes are long. | `0.4565` | `352.20` | `6.00` | `0.0213` | `0.0298` | `1.80` | `0.362` | `0.0035` | `0.0813` | `2.16M` | 128x5 | +| `ks16xv58` | `ancient-pyramid-609` | Extreme tempo-ratio study: almost no far chirping and very late mean chirp time; slower 512x5 and lower perf. | `0.4036` | `354.60` | `7.29` | `0.0088` | `0.0387` | `4.92` | `0.497` | `0.0060` | `0.0663` | `0.44M` | 512x5 | +| `899idvcg` | `giddy-eon-856` | Balanced candidate with short-ish episodes, low overlap, and clean terminal stats. | `0.4258` | `248.10` | `7.08` | `0.0262` | `0.0405` | `1.57` | `0.260` | `0.0042` | `0.0708` | `1.46M` | 64x5 | +| `m6vsxc7b` | `balmy-glitter-608` | Clean terminal profile and high base; useful if the shorter `x1ayhg3j` behavior is too noisy. 
| `0.4161` | `251.40` | `7.51` | `0.0266` | `0.0416` | `1.60` | `0.262` | `0.0001` | `0.0520` | `0.91M` | 256x5 | +| `z6mh0t3b` | `zesty-oath-412` | Highest scalar among the cleaner tempo candidates, but the tempo ratio is milder. | `0.4757` | `247.50` | `6.46` | `0.0254` | `0.0313` | `1.28` | `0.250` | `0.0015` | `0.0443` | `1.06M` | 256x5 | +| `63dl6lpc` | `jolly-night-696` | Risky extreme: `9.31x` tempo ratio and only 4.70 chirps, but timeout is high and episodes are long. | `0.4975` | `379.87` | `4.70` | `0.0008` | `0.0375` | `9.31` | `0.629` | `0.0447` | `0.0415` | `1.32M` | 256x4 | +| `gli5dke9` | `efficient-totem-502` | High perf with high tempo ratio, but very long/slow 512x5 and timeout is above the clean filter. | `0.5371` | `420.04` | `5.33` | `0.0126` | `0.0276` | `3.04` | `0.492` | `0.0139` | `0.0402` | `0.46M` | 512x5 | + +## Silent Outlier + +`rdjj5r21` / `atomic-dragon-816` stands far outside the rest of Bat9. In the +current local logs, it ranks 1st of 930 Bat9 rows by `env/perf` (`0.8438`) while +averaging only `0.022` chirps per episode. The next-lowest-chirp high-perf rows +are around `4.7` chirps. It also has the 3rd-lowest collision rate (`0.0079`) +and top-quartile base success (`0.9638`), but it is extremely slow: episode +length `468.48`, 4th-longest in the local Bat9 set, with timeout `0.0282`. + +Metric interpretation: this is not a close-range tempo-chirp policy. Its +`first_chirp_tick_norm` and `mean_chirp_tick_norm` are both about `0.996`, which +mostly means "no chirp happened" under the current logging convention. The high +score comes from combining high success and high curriculum difficulty with +near-perfect `chirp_perf`. + +Visual eval after retraining from the exact `rdjj5r21` hyperparameters confirmed +the exploit. The run loaded `checkpoints/bat/1781207491807/0000000034340864.bin` +via `--load-model-path latest`. 
Per human review, the bat mostly circles, almost +never chirps, and appears to wait for the inbound bug to hit it accidentally. +This explains the very high scalar score and very long episodes: it is not a +usable pursuit policy. + +Physics implication: the policy is exploiting the level 8+ inbound bug +curriculum and timer/motion priors rather than echolocation. At inbound levels, +the bug is re-aimed toward the bat every tick with noise and optional lateral +maneuver. A near-silent policy can therefore learn a wait/patrol strategy that +avoids collisions and catches the bug late. The observation does not include +direct bug position: it contains echo bins, chirp state/cooldown, speed, turn +rate, and timer. This run should remain documented as a useful failure case, not +as a default or video candidate. + +## Physics Knob Analysis + +Spearman correlations across all 789 complete rows: + +| pair | rho | +| --- | ---: | +| `perf` vs `chirps_emitted` | `-0.777` | +| `perf` vs `chirp_perf` | `0.776` | +| `perf` vs `far_chirp_rate` | `-0.678` | +| `perf` vs `chirp_overlap_fraction` | `-0.589` | +| `perf` vs `curriculum_perf` | `0.543` | +| `perf` vs `curriculum_level` | `0.508` | +| `perf` vs `bad_terminal` | `-0.443` | +| `perf` vs `base_perf` | `0.443` | +| `perf` vs `collision` | `-0.377` | +| `perf` vs `timeout` | `-0.300` | + +Wing sideband read: + +- `bug_wing_sideband_gain` has a weak positive relationship with `perf` + (`rho = 0.158`), stronger with `curriculum_perf` (`rho = 0.220`) and + `curriculum_level` (`rho = 0.237`). +- Top-quartile sideband gain hit `env/perf >= 0.25` in `69.2%` of rows versus + `51.0%` in the bottom quartile. +- It does not look like sidebands merely encourage broad noisy chirps: + correlation with `mean_chirp_bandwidth` is slightly negative (`rho = -0.078`), + and correlation with `chirps_emitted` is also slightly negative (`rho = -0.064`). + +Ear directivity read: + +- Raw ear gain effects are weak. 
`ear_rear_gain` is mildly positive for `perf` + (`rho = 0.071`), while `ear_front_gain` and `ear_side_gain` are near zero. +- Lower `front_to_rear` ratios look better: bottom-quartile hit rate is `70.7%` + versus `60.1%` for the top quartile. +- High `ear_side_gain` correlates with worse collision (`rho = 0.175`) and lower + `base_perf` (`rho = -0.170`), but the hit-rate split is flat. Treat this as a + weak caution, not a rule. +- `ear_separation_scale` has weak positive `perf` signal and high-perf IQR + around `1.73..1.99`. + +Other sweep reads: + +- `reflector_strength` is now beneficial at the higher end. Top quartile hit + rate is `81.3%` versus `53.6%` in the low quartile, unlike the Bat8 low-strength + preference. +- `horizon = 64` remains the only reliable setting. The few `128`/`256` rows are + mostly failures. +- `num_layers = 5` is still the default region; `num_layers = 4` can work for + speed (`qkwrqhzy`), while shallower models are under-sampled or weak. +- `hidden_size = 128` and `256` have the best hit rates, but the top Pareto run + is a 64x5 model. Use model size as a speed/behavior tradeoff, not a hard rule. + +ExtraTrees feature-importance sanity check ranked these as the top predictors of +`env/perf`: `train.beta1`, `train.clip_coef`, `train.ent_coef`, +`train.vf_clip_coef`, `env.curriculum_successes_per_level`, `env.bat_max_speed`, +`train.prio_beta0`, `env.chirp_cooldown_ticks`, `env.reflector_strength`, and +`policy.hidden_size`. Treat this as nonlinear importance, not causal proof. + +## Candidate Shortlist + +Display names below were resolved from W&B (`kinvert-k/bat9`) on 2026-06-11. + +| hash | W&B name | use when | human / visual notes | analysis notes | +| --- | --- | --- | --- | --- | +| `rdjj5r21` | `atomic-dragon-816` | failure-mode study for silent inbound exploit | Per human visual review after retrain: it mostly circles and almost never chirps, apparently waiting for the inbound bug to accidentally hit the bat. 
Interesting as a scalar exploit, but not a usable pursuit policy. | Rank 1 local Bat9 scalar outlier: `0.8438` perf, `0.9638` base, `0.8450` curriculum perf, `0.9985` chirp perf, `0.022` chirps, `468.48` episode length, `0.0282` timeout, and `0.0079` collision. Fresh checkpoint `1781207491807` reproduced the scalar profile and loaded `checkpoints/bat/1781207491807/0000000034340864.bin` via `--load-model-path latest`. This points to the inbound bug retargeting policy as a curriculum exploit source. | +| `ifbn2epd` | `super-wind-258` | first retrain and video attempt; behavior-strategy study | Per human visual review: performed poorly overall, but learned a very interesting speed-gated chirp strategy. It flies around most of the time presumably at minimum speed, accelerates presumably to max speed just before chirping, then slows right back down after the chirp. | Chosen first because it had the best Pareto score, `0.5398` perf, `0.9639` base, low collision, low overlap, and 2.15M SPS. The speed-before-chirp pattern is worth preserving as a discovered tactic even if this run is not the final default. | +| `ewgh6l5l` | `distinctive-surf-293` | current default candidate | Per human visual review: normal/low-level eval looked erratic, often spun in place, and every watched run appeared to time out. Retesting fixed level 10 showed the desired harder-level tactic: chirp to infer where the bug is going, move to that future path, then circle/wait there until the bug reaches the bat. A fresh 2026-06-11 retrain and level 10 eval confirmed this is the behavior we want as the current default. | Top non-silent Bat9 scalar profile: `0.5565` perf, `0.9497` base, `0.8392` curriculum perf, `0.6544` chirp perf, level `11.30`, `5.19` chirps, episode length `378.95`, timeout `0.0088`, and collision `0.0414`. Fresh checkpoint `1781208977022` reproduced the scalar profile and loaded `checkpoints/bat/1781208977022/0000000033554432.bin` via `--load-model-path latest`. 
| +| `qkwrqhzy` | `earnest-galaxy-621` | behavior-strategy study; not default yet | Per human visual review: weak overall, about 25% wins in watched eval, and poor at levels 0-1. Surprisingly more interesting on harder eval: it often chirps enough to infer where the bug is going, moves ahead of the bug, then circles until the bug reaches the bat. | 64x4 speed candidate with `0.5145` sweep perf and strong SPS. Fresh checkpoint `1781200339036` reproduced the scalar profile, but visual robustness was too low. Preserve the intercept-and-wait tactic as a discovered behavior, but do not use as default/video pick without fixing low-level competence. | +| `sfmk59n1` | `cool-snowflake-484` | sparse-chirp memory/navigation behavior study | Per human visual review at fixed level 10: very interesting deliberate sparse-chirp strategy. It can fly for long periods without chirping, apparently remembering what it saw from an earlier chirp, navigating around, looking, then chirping again later. It does not look like the previous aimless circling/intercept pattern. Visual perf did not look obviously high, but the behavior is important. | Chosen next because it is the strongest not-yet-watched high-curriculum candidate: `0.5314` perf, `0.9213` curriculum perf, level `13.51`, only `0.0014` timeout, `6.24` chirps, and very low overlap (`0.0070`). It is not another low-chirp/intercept candidate; it trades a moderate collision rate (`0.0701`) and 256x5 speed cost (`0.98M` SPS) for cleaner curriculum progress. Physics read: max sideband gain (`0.25`), high directivity gains (`rear 0.30`, `front 0.678`, `side 0.520`; front/rear `2.26`), slow max speed (`12.91`) with high turn rate (`9.10`). | +| `o7yrj371` | `vivid-breeze-268` | pursuit-biased/default candidate | Per human visual review at fixed level 10: performs well and actively pursues the bugs. 
This is the first watched high-perf, short-episode candidate that visually supports the pursuit hypothesis rather than the previously observed intercept-and-wait behavior. | Chosen from the high-perf, short-episode screen: `0.4749` perf, `221.67` episode length, `0.9323` base, level `13.21`, near-zero timeout (`0.0008`), moderate collision (`0.0670`), and fast 64x5 throughput (`1.99M` SPS). Fresh checkpoint `1781204865675` was trained from the exact `o7yrj371` hyperparameters and eval loaded `checkpoints/bat/1781204865675/0000000032768000.bin` via `--load-model-path latest`. | +| `x1ayhg3j` | `clean-pyramid-454` | blind-map behavior evidence | Per human visual review after fresh retrain: it does initial chirps, builds an apparent mental map of where the bug is going, then deliberately flies where it believes is right, often continuing blindly after the map is made. Interesting behavior, but not the selected default because the desired current default is the stronger wait-at-predicted-path tactic from `ewgh6l5l`. | Fresh checkpoint `1781208510323` reproduced the original scalar profile: `0.446` perf, `0.923` base, `0.821` curriculum perf, `199.11` episode length, `6.72` chirps, `1.735` tempo ratio, near-zero timeout, and collision `0.076`. Preserve as behavior evidence and comparison point. | +| `cpx4gj2f` | `sleek-smoke-681` | balanced 128x5 comparison | Needs visual review; useful if 64-wide `super-wind-258` looks brittle. | `0.9522` base, low bad terminal `0.0478`, 1.72M SPS; not Pareto-front by the selected objective mix. | +| `1a2s8uvf` | `good-valley-684` | low-chirp experiment | Needs visual review specifically for timeout behavior. | Best top-six chirp perf (`0.6888`) and 4.67 chirps, but timeout is high at `0.0512`; not a default without strong visual evidence. | +| `w938us46` | `fanciful-shape-202` | high-level stress check | Needs visual review; use for stress behavior rather than first video. 
| Highest level (`15.62`) while still `0.4501` perf; collision `0.1285`. | + +## Eval Notes And Video Pick + +Human visual note for `ifbn2epd` / `super-wind-258`: it performed poorly overall, +but learned a notable speed-gated chirp tactic. It appears to cruise at minimum +speed, accelerate sharply just before a chirp, then slow back down after chirping. +This is important behavior evidence and should be preserved even if the run is +not adopted as a default. + +Human visual note for `ewgh6l5l` / `distinctive-surf-293`: after retraining from +the sweep config into checkpoint `1781199673401` and running normal/low-level +eval, the policy looked erratic, often spun in place, and every watched run +appeared to time out. A later fixed level 10 retest using `--load-model-path +latest` showed the desired harder-level behavior: it chirps to infer where the +bug is going, moves onto that future path, then circles/waits there until the bug +reaches the bat. This is the same broad intercept-and-wait tactic later observed +in `qkwrqhzy`, but `ewgh6l5l` has the stronger scalar profile. + +Fresh default retrain on 2026-06-11 set `config/bat.ini` to the exact concrete +`ewgh6l5l` hyperparameters and produced checkpoint +`checkpoints/bat/1781208977022/0000000033554432.bin`. The scalar profile again +matched the sweep: `0.556` perf, `0.950` base, `0.839` curriculum perf, level +`11.298`, `5.191` chirps, `378.948` episode length, timeout `0.009`, collision +`0.041`, and chirp tempo ratio `0.134`. Human level-10 visual review confirmed +this is the intended current default: it chooses a place on the predicted bug +path and waits/intercepts there. + +Human visual note for `o7yrj371` / `vivid-breeze-268`: after training from the +exact sweep hyperparameters into checkpoint `1781204865675`, fixed level 10 eval +loaded with `--load-model-path latest` showed good performance and active bug +pursuit. 
This is the cleanest visual support so far for the short-episode screen: +it looked like it chased the bugs rather than mainly waiting on a predicted +intercept point. + +Human visual note for `x1ayhg3j` / `clean-pyramid-454`: after training from the +exact sweep hyperparameters into checkpoint `1781206232504`, normal eval and +fixed level 10 eval loaded `checkpoints/bat/1781206232504/0000000035651584.bin` +with `--load-model-path latest`. Scalar profile reproduced the sweep pattern: +about `0.446` perf, `199` episode length, `6.72` chirps, far chirp rate `0.029`, +near chirp rate `0.048`, and tempo ratio about `1.74`. Level 10 visual review +showed that this should not be interpreted as simply chirping more continuously +when close. It sometimes chirps, builds an apparent internal estimate of the bug +trajectory/map, then flies an intercept course with no more chirps. It works +roughly half the time in watched attempts and looks like a confident blind +intercept strategy. + +Human visual note for `qkwrqhzy` / `earnest-galaxy-621`: after retraining from +the sweep config into checkpoint `1781200339036`, normal eval and fixed level 10 +eval showed weak overall win rate, roughly 25% in the watched sample, and poor +behavior at levels 0-1. The run nevertheless learned a notable harder-level +strategy: chirp enough to infer the bug trajectory, position itself ahead of the +bug, then circle/intercept until the bug reaches the bat. This is valuable +behavior evidence but not a default-quality policy. + +Human visual note for `sfmk59n1` / `cool-snowflake-484`: after setting +`config/bat.ini` to the exact concrete `sfmk59n1` hyperparameters, the policy was +trained into checkpoint `checkpoints/bat/1781203704964/0000000036175872.bin`. 
+The fresh run reproduced the sweep-scale scalar profile: final `env/perf` about +`0.531`, `base_perf` about `0.929`, `curriculum_perf` about `0.921`, curriculum +level `13.507`, `6.24` chirps, timeout about `0.001`, and collision about +`0.070`. Fixed level 10 visual review showed an important sparse-chirp +memory/navigation tactic: it flies deliberately for long periods without +chirping, apparently using information remembered from an earlier chirp, +navigates around while looking, then chirps again later. It did not look like the +previous aimless circling/intercept behavior. Visual performance did not look +obviously high, but this behavior should be preserved as evidence of memory-like +navigation under limited chirping. + +Recorded MP4 artifact: `recordings/bat_recording.mp4`. This is a 30.0 second, +640x640, 30 fps H.264 MP4 with AAC audio, recorded from `ewgh6l5l` / +`distinctive-surf-293` at fixed level 10 using `--load-model-path latest`. To +make `latest` resolve to this candidate after later runs had newer checkpoints, +the `ewgh6l5l` checkpoint was copied non-destructively to +`checkpoints/bat/ewgh6l5l-latest-eval/0000000033554432.bin`. The video is a +behavior-evidence artifact for the trajectory-prediction/intercept tactic, not a +claim that this run is default-quality. + +Current default candidate is `ewgh6l5l` / `distinctive-surf-293`. It is not the +most robust low-level visual policy, but it is the best current match for the +desired behavior: chirp enough to infer the future bug path, move to that path, +then wait/intercept there. `qkwrqhzy` preserves a similar tactic but looked weaker +overall. `x1ayhg3j` remains useful behavior evidence for blind-map navigation, +and `sfmk59n1` remains important behavior evidence for sparse-chirp +memory/navigation. + +`config/bat.ini` is now intentionally set to `ewgh6l5l` for the current default +candidate. This decision is based on fresh retrain plus level 10 visual eval, not +scalar rank alone. 
+ +## Recommended Next Defaults + +Current default source: + +| parameter | recommendation | +| --- | --- | +| candidate source | `ewgh6l5l` / `distinctive-surf-293` | +| `policy.hidden_size` | `256` | +| `policy.num_layers` | `5` | +| `train.horizon` | `64` | +| `vec.num_buffers` | `4` | +| `env.bug_wing_sideband_gain` | `0.19056934455600955` | +| `env.ear_rear_gain` | `0.22038613968607276` | +| `env.ear_front_gain` | `0.6419214149115183` | +| `env.ear_side_gain` | `0.28043867572747055` | +| `env.ear_separation_scale` | `2.0` | +| `env.reflector_strength` | `0.6` | +| `env.chirp_cooldown_ticks` | `11` | +| `env.curriculum_successes_per_level` | `4` | +| `env.curriculum_bug_distance_step` | `2.0` | + +Keep `--sweep.use-gpu ""` for future Bat9 sweep continuation so Protein stays +off GPU while training uses `--train.gpus 1`. diff --git a/BAT_CURRICULUM.md b/BAT_CURRICULUM.md new file mode 100644 index 0000000000..45a4a0f4cc --- /dev/null +++ b/BAT_CURRICULUM.md @@ -0,0 +1,512 @@ +# Bat Curriculum Strategy + +This note explains how Bat curriculum should work, why `curriculum_perf` can +appear pinned around `0.2`, and how to extend difficulty toward maneuvering bugs +without adding artificial reward dials. + +## Current Diagnosis + +The old `curriculum_perf` behavior was mostly structural. 
+ +The old code computed: + +```text +curriculum_difficulty = (distance_norm + obstacle_norm) / 2 +curriculum_perf = success * curriculum_difficulty +``` + +With the current defaults: + +```ini +curriculum_initial_level = 1 +CURRICULUM_START_OBSTACLES = 0 +CURRICULUM_MAX_OBSTACLES = 3 +curriculum_obstacle_step = 8 +curriculum_start_bug_distance = 8.438 +CURRICULUM_MAX_BUG_DISTANCE = 40.0 +CURRICULUM_BUG_DISTANCE_STEP = 2.0 +CURRICULUM_INBOUND_MAX_BUG_DISTANCE = 56.0 +CURRICULUM_INBOUND_BUG_DISTANCE_STEP = 4.0 +``` + +At `curriculum_level ~= 5`, the bug starts around distance `28`, giving: + +```text +distance_norm ~= (28 - 8) / (56 - 8) ~= 0.42 +obstacle_norm = 0.0 because obstacle count is still 1 +curriculum_difficulty ~= (0.42 + 0.0) / 2 ~= 0.21 +``` + +So `curriculum_perf ~= 0.2` did not necessarily mean the policy hit an +impossible wall. It means the metric gives half of its difficulty credit to +obstacles, but obstacle difficulty stays zero until level `18`. + +The current code logs split difficulty components and computes +`curriculum_difficulty` from active distance and obstacle components: + +```text +distance_norm = normalize(start_bug_dist) +obstacle_norm = normalize(num_obstacles) +motion_norm = 0 until bug maneuvers are added + +curriculum_difficulty = + (0.50 * distance_norm + + 0.50 * obstacle_norm) / active_weight +``` + +Chirp-budget pressure is intentionally excluded from curriculum difficulty. +Motion difficulty is logged separately as `0`, but it does not lower the metric +ceiling before maneuver curricula exist. + +## Design Principles + +- Keep `base_perf` pure: `1.0` for catching the bug, `0.0` otherwise. +- Use composite `perf` for sweep ranking, but always sanity-check it against + `base_perf`, `curriculum_level`, and failure modes. +- Curriculum should change task distribution, not secretly define behavior with + dense reward shaping. 
+- Chirp timing pressure should come from physical constraints: + finite budget, cooldown, and overlapping echo ambiguity. +- Movement pressure should come from task dynamics, not a hover exploit. The + bat must keep a minimum forward speed so harder levels cannot collapse into + stationary timeout policies. +- Do not add generic timing-efficiency reward. +- Difficulty metrics should be smooth enough that W&B does not show fake + plateaus caused by integer schedule thresholds. +- Every curriculum rung should be reversible and testable. + +## Recommended Metrics + +Add or keep these logs: + +```text +base_perf +perf +curriculum_perf +curriculum_level +curriculum_distance_difficulty +curriculum_obstacle_difficulty +curriculum_difficulty +bug_motion_mode +bug_motion_speed +num_obstacles +chirp_overlap_fraction +collision +timeout +SPS +``` + +Keep the W&B export list capped at 31 explicit metrics because PufferLib appends +`n` as the 32nd metric. Diagnostics such as `budget_difficulty`, +`curriculum_motion_difficulty`, chirp far/near fractions, and redundant inverse +ratios should stay internal unless they are needed for a specific sweep. +Do not remove `score` from `binding.c`; PufferLib's train worker reads +`metrics["env/score"]` when a child process exits. If the cap is tight, drop +`episode_return` instead. + +The key change is splitting `curriculum_difficulty` into components. If +`curriculum_perf` is low, we should be able to tell whether the policy is stuck +on distance, obstacles, or later motion. + +## Recommended Difficulty Formula + +Use an explicit weighted difficulty. 
Until motion curriculum exists, renormalize +over active components: + +```text +distance_norm = normalize(start_bug_dist) +obstacle_norm = normalize(num_obstacles) +motion_norm = normalize(bug maneuver difficulty) + +curriculum_difficulty = + 0.50 * distance_norm + + 0.50 * obstacle_norm + +curriculum_difficulty /= active_weight + +curriculum_perf = base_perf * curriculum_difficulty +``` + +The exact weights can change, but the important property is that no inactive +component silently cuts the maximum metric in half. If `motion_norm` is not +enabled yet, either log it as `0` and accept the lower ceiling, or renormalize +active components. For sweeps, renormalizing active components is easier to +interpret. + +## Recommended Stage Order + +### Stage 0: Known-good baseline + +Purpose: + +- Preserve the current solved rung as a fallback. + +Task: + +- No obstacles at level 0. +- Moderate starting bug distance. +- Current forward-only bat dynamics. +- Configurable minimum forward speed; brake cannot stop the bat below this + floor. +- Current chirp budget and overlap penalty. + +Gate: + +- `base_perf >= 0.80` +- collision not exploding +- timeout not dominating + +### Stage 1: Distance curriculum + +Purpose: + +- Make the bat solve larger spatial uncertainty before adding more clutter. + +Schedule: + +```text +start_bug_distance = 8 + level * distance_step +``` + +Recommendation: + +- Keep `CURRICULUM_BUG_DISTANCE_STEP` hardcoded at `2.0`; Bat9's best runs + clustered there, and the inbound curriculum already expands later distances. +- Log `curriculum_distance_difficulty` directly. + +Gate: + +- advance after a small success count, not one lucky catch. +- current `curriculum_successes_per_level = 21` is conservative; for 50M-step + runs, sweep lower values such as `4..16`. + +### Stage 2: Obstacle curriculum + +Purpose: + +- Avoid the current metric plateau where obstacle difficulty is invisible until + level `18`. 
+ +Recommendation: + +- Reduce default `curriculum_obstacle_step`. +- The current default is `4`, with only level 0 obstacle-free: + +```text +level 0: 0 obstacles +level 1..4: 1 obstacle +level 5..8: 2 obstacles +level 9+: 3 obstacles +``` + +Alternative: + +- Keep the count schedule but add obstacle size or reflector strength as a + smoother sub-rung. + +Gate: + +- Do not reduce chirp budget as obstacle count rises; clutter legitimately + requires reacquisition chirps. + +### Stage 3: Chirp budget curriculum + +Purpose: + +- Force useful chirp timing without artificial timing reward. + +Current behavior: + +- Observation includes `chirps_used / chirp_budget`. +- Chirping after the chirp budget is exhausted causes a `-1` terminal. +- Budget is fixed across curriculum levels. +- Valid chirps before the previous max echo window clears get a physical + overlap penalty. + +Recommendation: + +- Keep terminal/logging pressure as the primary signal: + +```text +chirp_perf = clamp(1.0 - chirps_emitted / 15.0, 0.05, 1.0) +perf = base_perf * curriculum_difficulty * chirp_perf +``` + +- Keep `chirp_overlap_penalty` small and sweepable. +- Treat `chirp_overlap_fraction` as a diagnostic, not the main objective. +- Keep chirp-budget ratio as an observation instead of an exported diagnostic; + the fixed 15-chirp reference gives cleaner Protein ranking pressure across + 10, 8, and 6 chirp policies. + +### Stage 4: Constant-velocity moving bug + +Purpose: + +- Make Doppler and reacquisition matter while keeping motion predictable. + +Current/near-term model: + +- Fixed speed and heading. +- Wall bounce: + - vertical wall flips `vx` + - horizontal wall flips `vy` + +Recommended knobs: + +```text +bug_speed +bug_wall_bounce_enabled +``` + +Gate: + +- Require maintained `base_perf` and non-collapsing chirp behavior, checked + through `chirps_emitted` and `chirp_perf`. +- Motion should not start before distance and obstacle rungs are stable.
+ +### Stage 5: Simple bug maneuvers + +Purpose: + +- Add nontrivial pursuit without jumping directly to adversarial behavior. + +Recommended maneuver order: + +1. Sine lateral motion + - Bug keeps forward velocity but adds low-frequency lateral acceleration. + - Knobs: `bug_maneuver_amplitude`, `bug_maneuver_period`. + +2. Smooth heading drift + - Bug heading changes slowly with bounded turn rate. + - Knobs: `bug_turn_rate`, `bug_turn_period`. + +3. Circular or oval path segments + - Bug follows simple parametric curves. + - Knobs: `bug_orbit_radius`, `bug_orbit_period`. + +4. Piecewise constant heading changes + - Bug chooses a new heading every N ticks. + - Knobs: `bug_heading_change_interval`, `bug_heading_change_angle`. + +5. Mild evasive steering + - Bug slowly biases away from bat only at higher curriculum. + - This should come late because it changes the task from tracking to pursuit. + +Do not expose bug mode, bug position, or true velocity in observations. The bat +should infer motion from echoes. + +## Proposed Curriculum State + +Keep the current single integer `curriculum_level`, but derive separate +sub-difficulties from it: + +```text +distance_level = level +obstacle_level = max(0, level - obstacle_start_level) +budget_level = max(0, level - budget_start_level) +motion_level = max(0, level - motion_start_level) +``` + +Recommended starting points: + +```ini +obstacle_start_level = 4 +budget_start_level = 0 +motion_start_level = 10 +maneuver_start_level = 16 +``` + +This keeps one scalar progression while preventing all difficulty knobs from +turning on at once. + +## Eval Requirements + +Eval should support fixed curriculum levels so we can inspect specific rungs. + +Useful modes: + +```text +default latest curriculum level +fixed level 0 +fixed level 6 +fixed level 12 +fixed level 18 +fixed maneuver mode +``` + +This matters because an aggregate training metric can look fine while a +specific rung has bad behavior. 
+ +## TDD Targets + +Add focused tests before changing curriculum code: + +```text +curriculum_difficulty_logs_distance_and_obstacle_components +curriculum_obstacles_advance_before_level_18 +curriculum_budget_reduces_monotonically_with_level +curriculum_motion_stays_zero_before_motion_start_level +bug_wall_bounce_flips_x_or_y_velocity +bug_sine_maneuver_keeps_speed_bounded +eval_fixed_curriculum_level_overrides_training_level +``` + +## Near-Term Recommendation + +Before adding maneuvers, do this: + +1. Add split difficulty logs: + - `curriculum_distance_difficulty` + - `curriculum_obstacle_difficulty` + - `curriculum_motion_difficulty` + +2. Change obstacle schedule so it starts contributing around level `6`, not + level `18`. + +3. Consider reducing `curriculum_successes_per_level` default from `21` to a + faster value such as `8..12`, while keeping it sweepable. + +4. Add fixed-level eval override so visual checks can inspect harder rungs + directly. + +5. Only then add simple bug maneuvers, starting with constant velocity wall + bounce and then sine/heading drift. + +The goal is a ladder where each rung is visibly harder, metrics explain why, +and the bat must improve sensing behavior without reward terms that directly +script the desired chirp timing. + +## Current Curriculum Cleanup + +Chirp-budget pressure is no longer mixed into curriculum difficulty. + +Rationale: + +- More obstacles legitimately require more chirps. A cluttered arena should not + rank worse just because the bat used more chirps than it would in open space. +- Chirp count is a sensing-efficiency metric, not a world-difficulty metric. +- A shrinking chirp budget can make later curriculum levels impossible before + we know whether the policy has learned robust obstacle disambiguation. 
+ +Current curriculum split: + +```text +level 0: + no obstacles + moving bug only + +later levels: + increase bug start distance + introduce the first obstacle immediately at level 1 + increase obstacle count/clutter every few levels + then add maneuvering bug motion +``` + +Current curriculum difficulty: + +```text +curriculum_difficulty = + 0.5 * distance_difficulty + + 0.5 * obstacle_difficulty +``` + +If a component has not been activated yet, renormalize over active components +instead of letting inactive components cap the score. Once obstacle curriculum +is active, the two-component `0.5 / 0.5` interpretation is easy to explain: +half distance, half clutter. + +Current chirp handling: + +```text +MAX_CHIRPS_PER_EPISODE = 15 +chirp_budget does not decrease with curriculum level +chirp_budget_difficulty is removed from curriculum difficulty +``` + +Reward/perf pressure for chirps should focus on intelligent use, not simply +fewer chirps everywhere: + +- Keep a finite chirp budget so chirps are not unlimited. +- Keep overlap penalty because overlapping echo returns are physically + ambiguous. +- Consider rewarding successful catches with a chirp-use bonus based on + `chirps_emitted / 15`, but avoid dense shaping that scripts exact chirp + timing. +- Keep `chirp_perf = clamp(1.0 - chirps_emitted / 15.0, 0.05, 1.0)` as a sweep + diagnostic/objective term, but interpret it together with obstacle difficulty + and not as an absolute "fewer chirps is always better" rule. + +This cleanup changes how `perf` compares to older Bat sweep runs. Compare old +and new runs through component logs (`base_perf`, `curriculum_perf`, +`chirp_perf`) when needed. + +## Bat3 Partial Sweep Notes + +These notes are from an in-progress `bat3` W&B peek on June 9, 2026. Treat them +as directional, not final. + +Early top-`perf` runs show: + +- `bat_min_speed` tends toward the low end, usually near `2.0`. +- `bat_turn_rate` tends toward the high end, often near `3pi`. 
+- `sound_speed` tends toward the high end, often `175..180`. +- `progress_reward_scale` tends toward the high end, around `0.11..0.12`. +- Good policies often catch with roughly `6..8` chirps. +- Highest `base_perf` runs can exceed `0.90`, but may use more chirps and rank + lower by `perf`. +- Highest `curriculum_perf` runs reach around levels `8..9`, but often pay with + higher collision/timeout and around `10` chirps. + +Behavior read: + +- The current interesting behavior is circle-search followed by full-speed + dash/intercept after apparent target acquisition. +- Do not remove this behavior unless harder fixed-level evals prove it is an + exploit. It may be a useful active-sensing search pattern. + +Metric implication: + +- `chirp_perf` is working as a sweep ranking term, but it also confirms that + "fewer chirps" cannot be the whole story once obstacles increase. +- More clutter can legitimately require more chirps, so chirp count should stay + separate from world/curriculum difficulty. + +Post-sweep PR gate: + +1. Update `config/bat.ini` defaults from the best sane run, not merely highest + `perf`. +2. Run one normal training pass with those defaults. +3. Eval fixed levels `0`, `4`, `7`, and `10`. +4. Commit only if visual behavior remains sane and the policy does not regress + to hover/spin/collision farming. + +Visual eval diagnosis from `fresh-wood-149`: + +- Default/easier eval looks good: circle-search followed by dash/intercept. +- Fixed level `10` performs poorly. +- Fixed level `7` reveals the likely failure mode: + - the bat spends many chirps during early search before acquiring the bug, + - once it finally gets a useful bug signal, the remaining chirp budget is low, + - it dashes toward the last known/acquired bug direction, + - if the bug moves enough after the final echoes, the bat keeps flying blind + and misses. + +Implication: + +- The next bottleneck is acquisition/reacquisition under finite chirp budget, + not basic forward flight. 
+- Be careful with any metric or reward that simply minimizes chirp count. At + harder distances or with obstacles, useful policies may need more search and + reacquisition chirps. +- This supports the current cleanup: keep a fixed chirp budget for now and keep + chirp pressure separate from curriculum difficulty before adding harder + motion or more clutter. + +## Reward-Shaping Guardrails + +Bug-echo progress shaping is allowed, but it must not pay for passive target +motion. Compare the current bug echo path against the previous bug echo only +after the bat has displaced by at least `bug_echo_min_displacement`. If the echo +path is shorter, reward by `bug_echo_reward_scale`; if it is longer, apply a +weaker penalty using `bug_echo_farther_penalty_scale`, currently defaulting to +`0.10`. diff --git a/BAT_PRIORITIES.md b/BAT_PRIORITIES.md new file mode 100644 index 0000000000..6df2accefb --- /dev/null +++ b/BAT_PRIORITIES.md @@ -0,0 +1,65 @@ +# Bat Priorities + +Current near-term priorities for the Bat PufferLib environment. + +## 0. Video capture with audio + +- RayLib can render and play audio, but it does not natively encode MP4. +- Preferred path: keep RayLib as the renderer/audio source, capture frames/audio + during eval, and use `ffmpeg` to mux an MP4. +- A future helper should make this feel like one command, but avoid embedding an + MP4 encoder in the env. +- Existing GIF capture remains useful for quick silent demos. +- Later render polish: play audible reflection blips in addition to emitted + chirps. Keep this eval-only. Bug reflections and static wall/obstacle + reflections should likely use distinguishable volume, timbre, panning, or + marker sounds so the debug audio stays interpretable. + +## 1. Add episode timer observation + +- Add a normalized episode timer observation so the policy knows urgency. +- For the current `max_steps = 512` Bat episode budget, expose a float in + `[0, 1]` representing elapsed time from `0` ticks to timeout. 
If the budget is + later changed to exactly `500`, scale the same way from `0..500`. +- The Bat8 visual evals show a likely failure mode where policies chirp too + little, settle into circling, and time out. Without a timer observation, the + policy has no direct signal that it is running out of episode time. + +## 2. Bug-reflection chirp timing penalty + +- Replace broad "chirp before all echoes clear" pressure with bug-specific + timing pressure. +- Penalize a valid chirp if it is emitted before the previous chirp's expected + bug reflection has returned. +- Scale the penalty by remaining wait fraction, so chirping immediately after a + prior chirp is worse than chirping shortly before the bug echo arrives. +- Keep the coefficient sweepable through `chirp_overlap_penalty`. +- Do not penalize based on all static wall/obstacle reflections; clutter may + legitimately require reacquisition chirps. + +## 3. Resume performance work + +- Use level 7 and level 10 evals as visual sanity checks. +- Focus on harder-level failures where the bat spends chirps before acquiring + the bug. +- Keep reward shaping minimal and prefer terminal/curriculum/perf pressure where + possible. + +## 4. Prepare the next sweep + +- Make sure the next sweep includes any new timing penalty coefficient ranges. +- Sweep `chirp_cooldown_ticks` in a bounded range. Current range is `6..18`. +- Keep `max_chirps_per_episode` fixed at `15` for this sweep so budget does + not confound timing penalty and cooldown effects. +- Cap policy sweep size at `hidden_size = 64..256` and `num_layers = 2..4` so + overnight sweeps do not waste runs on very slow oversized networks. +- Keep sweep ranges bounded so runs cannot become extremely slow from oversized + policies or excessive env settings. +- Watch `perf`, `base_perf`, `curriculum_perf`, `chirps_emitted`, + `chirp_overlap_fraction`, `chirp_tempo_ratio`, `collision`, and SPS. 
+ +## Priority judgment + +The current ordering is sound: the video/audio capture work is useful for demos, +but the bug-reflection timing penalty is more likely to improve level 7/10 +performance before the next sweep. diff --git a/BAT_SONAR_OBSERVATION_NOTES.md b/BAT_SONAR_OBSERVATION_NOTES.md new file mode 100644 index 0000000000..238138bdb3 --- /dev/null +++ b/BAT_SONAR_OBSERVATION_NOTES.md @@ -0,0 +1,260 @@ +# Bat Sonar Observation Notes + +Status: design note for current and future Bat agents + +Workspace: `/home/claude/pathfinder` + +Related spec: `BAT_SPEC.md` + +## Purpose + +This note records the intended next observation and echo model for the Bat +environment. The current implementation was deliberately simplified to get a +trainable baseline. The next rung should make active echolocation real: the bat +should hear frequency energy only when echoes from its own chirps return. + +## Retired Scaffold Implementation + +The first Bat observation was a fast synthetic feature extractor, not a true +chirp-return audio model. It has been retired, but the notes are kept here so +future agents understand why the env moved away from it. + +Retired layout: + +- `left_range_energy[16]` +- `left_doppler_energy[16]` +- `right_range_energy[16]` +- `right_doppler_energy[16]` +- `chirp_age_norm` +- `last_chirp_start_freq_norm` +- `last_chirp_end_freq_norm` +- `last_chirp_duration_norm` +- `forward_speed_norm` +- `turn_rate_norm` + +Total size: `70`. + +Each frame, the env recomputes current echo features from the current bat, +bug, wall, and obstacle positions. The bug is one strong moving reflector. +Walls and obstacle edges are sampled into static point reflectors. For each +reflector, the env computes approximate left-ear and right-ear path lengths, +attenuation, left/right gain, and a normalized Doppler value. It then deposits +energy into range-indexed observation slots.
+ +This is useful for a first baseline, but it is too informative: + +- The bat gets fresh echo-like information every frame, even if it did not + chirp. +- Chirp start frequency, end frequency, and duration do not materially affect + the acoustic observation. +- The Doppler channels are scalar range-indexed values, not FFT bins. +- Range is exposed as direct binned path length instead of being inferred from + echo return timing. + +## Current Target Model + +The observation should be per-tick binaural frequency energy: + +- `left_freq_bins[N]` +- `right_freq_bins[N]` +- chirp metadata +- cooldown/age metadata +- self-motion metadata + +No explicit delay/range bins are needed in the observation. Distance should be +implicit in time. The policy should infer range from when frequency energy +returns after a chirp. + +Current layout: + +- `left_freq_bins[16]` +- `right_freq_bins[16]` +- `chirp_age_norm` +- `chirp_cooldown_norm` +- `last_chirp_start_freq_norm` +- `last_chirp_end_freq_norm` +- `last_chirp_duration_norm` +- `forward_speed_norm` +- `turn_rate_norm` + +Total size: `39`. + +If 16 bins is too coarse after implementation, use 24 bins per ear for a total +size of `55`. + +## Event-Driven Echo Model + +Do not synthesize raw audio and do not run an FFT per environment step. Use an +analytic event model that directly deposits echo energy into frequency bins at +the tick when the echo reaches each ear. + +When a chirp is emitted: + +1. Break the chirp into a small number of time slices. +2. For each slice, compute the emitted frequency from chirp start frequency, + end frequency, and duration. +3. For each reflector, compute when that slice reaches the reflector. +4. Compute when the reflected sound reaches the left ear and right ear. +5. Compute returned amplitude, ear gain, and Doppler-shifted frequency. +6. Enqueue an echo event for each ear. 
+ +Each echo event should store: + +- receive time in continuous ticks or seconds +- target ear +- returned normalized frequency +- intensity +- source chirp identifier or chirp birth tick, if useful for debugging + +On each env tick: + +1. Clear left/right frequency bins. +2. Process all echo events whose receive time falls in the current tick window. +3. Deposit event intensity into the relevant frequency bin, with optional + fractional spill into neighboring bins. +4. Add a small configurable noise floor. +5. Apply bounded compression, such as `log1p(k * energy) / log1p(k)`. +6. Append chirp and self-motion metadata. + +This produces the desired behavior: + +- No chirp means no new echo energy, aside from noise or any intentionally + modeled lingering sensor state. +- A low-to-high chirp creates a time-coded return pattern. +- Multiple reflectors can overlap naturally in the same tick and frequency + bin. +- Range must be inferred from echo timing, not from a direct range channel. + +## Example: Two-Frequency Chirp and Two Targets + +Assume two frequency bins: low and high. + +The bat emits a two-slice chirp: + +- slice 0: high frequency +- slice 1: low frequency + +There are two static targets, one near and one far. With zero Doppler, the +per-tick ear spectrum could look like: + +```text +[0, 0] sound still traveling +[0, 0] sound still traveling +[0, 1] near target returns high slice +[1, 1] near target returns low slice, far target returns high slice +[1, 0] far target returns low slice +[0, 0] no active returns +``` + +This is the intended observation style. It is not a delay-bin representation. +The temporal sequence itself contains the delay/range information. 
+ +## Timing and Physics Notes + +Echo timing is two-way: + +```text +emit position -> reflector -> ear +``` + +For static reflectors, the approximate return time is: + +```text +t_receive = t_emit + + distance(chirp_origin, reflector) / sound_speed + + distance(reflector, ear_at_receive) / sound_speed +``` + +For moving reflectors, such as the bug, the hit time should use predicted +reflector position at the time of impact. A linear-motion approximation is good +enough for the next implementation. + +Doppler should be based on the rate of change of the acoustic path length: + +```text +doppler_shift ~= -path_length_rate / sound_speed +``` + +Static walls and obstacles can still have Doppler from bat self-motion. The +moving bug additionally contributes target radial velocity. + +Use fractional receive times internally. The env control tick can stay at +`1/60` second while echo events are scheduled at sub-tick times and deposited +into the nearest tick or split across adjacent ticks. + +## Chirp Overlap and Memory + +Without explicit delay bins, the policy needs temporal memory to infer range. +The observation at a single tick only says what frequency energy is arriving +now. It does not directly say how long ago that sound was emitted unless the +policy remembers the chirp sequence or the env provides reliable chirp-age +metadata. + +For the next rung, use one active chirp at a time: + +- `chirp_cooldown_ticks >= max_echo_return_ticks` +- include `chirp_age_norm` +- include last chirp start frequency, end frequency, and duration + +This keeps return timing interpretable before adding overlapping chirps. Later +curriculum stages can reduce cooldown and allow ambiguity from multiple active +chirps. + +## Performance Constraints + +The target is high SPS. Avoid raw waveform buffers, convolution, and per-step +FFT. 
+ +Use: + +- a fixed upper bound on active chirps +- a fixed upper bound on echo events +- static reflector precomputation after reset +- direct frequency-bin deposition +- simple geometric attenuation and ear gain +- first-order reflections only + +The expected work per tick should stay near: + +```text +active_chirps * chirp_slices * reflectors * ears +``` + +With small constants, this remains cheap C code and should preserve the spirit +of the current native PufferLib env. + +## Implementation Direction + +The next implementation should replace current range/Doppler observation +generation with an event queue. + +Suggested data structures: + +- `ChirpEvent`: emitted chirp metadata, birth time, origin, frequency sweep +- `Reflector`: position, velocity, strength, normal or type +- `EchoEvent`: receive time, ear, frequency, intensity + +Suggested tests: + +- no chirp produces no echo energy beyond noise +- single static reflector returns at expected two-way travel time +- left and right ears receive slightly different timings/intensities off-axis +- two chirp slices and two reflectors produce the expected overlapping bin + pattern +- moving bug shifts frequency in the expected Doppler direction +- cooldown prevents ambiguous overlapping chirps in the initial curriculum +- bug echo progress reward only fires when the echo-derived bug path is shorter + than the previous bug echo path +- static echoes never receive bug echo progress reward + +## Non-Goals for the Next Rung + +Do not add raw audio synthesis yet. + +Do not add an actual FFT dependency yet. + +Do not add full wave acoustics. + +Do not add multi-bounce reverberation yet. + +Do not expose direct range bins if the goal is to force temporal echolocation. 
diff --git a/BAT_SPEC.md b/BAT_SPEC.md new file mode 100644 index 0000000000..6c58f8f901 --- /dev/null +++ b/BAT_SPEC.md @@ -0,0 +1,646 @@ +# Bat Environment Spec + +Status: draft baseline; ready for implementation planning after review + +Workspace: `/home/claude/pathfinder` + +Target branch: `bat` + +Target env name: `bat` + +Detailed sonar observation design note: + +- `BAT_SONAR_OBSERVATION_NOTES.md` + +## Intent + +Build a single-agent PufferLib Ocean environment inspired by bat echolocation. +The agent controls a bat flying in a 2D arena with walls, static obstacles, and +a moving bug target. The bat must avoid collisions and catch the bug using +binaural acoustic returns from self-generated chirps rather than direct map or +position observations. + +The first version should copy the small native-C env style used by Breakout: +fixed-size observations, a compact action space, simple deterministic physics, +and enough instrumentation to make training failures debuggable. + +The core challenge is active sensing. The policy must learn both how to move +and how to emit useful chirps. The environment should make chirping meaningful +without turning v1 into a full acoustic wave simulator. + +## Research Grounding + +Range cue: + +- Echolocating bats primarily estimate target distance from the delay between + an emitted call and the returning echo. +- Source: https://pmc.ncbi.nlm.nih.gov/articles/PMC9157489/ +- Source: https://pmc.ncbi.nlm.nih.gov/articles/PMC7888678/ + +Binaural direction cue: + +- Left/right ear differences are biologically plausible and useful. Bats use + binaural and spectral cues, including head-related transfer effects, to infer + sound direction. +- Source: https://pubmed.ncbi.nlm.nih.gov/15658710/ +- Source: https://pmc.ncbi.nlm.nih.gov/articles/PMC4555857/ + +Chirp design: + +- Linear frequency-modulated chirps are standard in radar and sonar because + matched filtering can compress a long emitted pulse into a sharp return peak. 
+ Bandwidth controls range resolution, and the time-bandwidth product controls + processing gain. +- Source: https://rfessentials.com/rf-knowledge-base/how-does-pulse-compression-improve-the-range-resolution-and-sensitivity-of-a-rad/ + +Doppler: + +- Doppler shift is a useful velocity cue, especially for moving targets and + insect-like prey. Some bat species actively compensate call frequency to keep + important echo bands in a sensitive range. +- Source: https://pmc.ncbi.nlm.nih.gov/articles/PMC2438418/ +- Source: https://www.nature.com/articles/s41598-018-22880-y + +Fast signal processing: + +- FFTW is the high-performance reference point for C FFT design, but v1 should + avoid adding FFTW as a dependency. A fixed-size radix-2 FFT or precomputed + analytic matched-filter bins are preferred. +- Source: https://www.fftw.org/fftw2_doc/fftw_1.html +- Source: https://web.stanford.edu/class/cme324/classics/cooley-tukey.pdf + +Reflection model: + +- Full wave acoustics is out of scope for v1. A geometric echo model is the + right first approximation: sound travels in straight paths, reflects from + objects, and returns with delay, angle-dependent ear gain, attenuation, and + optional Doppler. +- Source: https://au.mathworks.com/help/audio/ug/room-impulse-response-simulation-with-image-source-method-and-hrtf-interpolation.html + +## Environment Model + +World: + +- 2D continuous rectangular arena. +- Arena dimensions are fixed by config. +- Boundaries are solid walls. +- Static obstacles are axis-aligned rectangles. +- The bug is a moving circular target. +- The bat is a moving circular agent with heading, speed, turn rate, and + collision radius. + +Physics: + +- Fixed control/physics timestep, default `1/60` second. +- Bat motion is acceleration-limited and turn-rate-limited. +- The bat has a configurable minimum forward speed. It cannot hover; brake + only reduces speed down to this stall-speed floor. +- Bug motion uses a simple deterministic or seeded random policy. 
+- The bug reflects from walls and obstacles. +- The bat collides with walls and obstacles. +- Catch success occurs when bat and bug circles overlap. +- The `1/60` second tick is not the acoustic sample rate. Echo delays are + computed analytically with fractional timing inside each env step. + +Acoustics: + +- Walls, obstacles, and the bug reflect chirps. +- Static reflectors provide range and direction cues. +- The bug is the only moving reflector, so it is the main Doppler source. +- The env computes compact acoustic features analytically instead of storing or + convolving high-rate audio samples. +- Sound speed is configurable and artificial. The default should be much slower + than real air acoustics so echo timing is learnable in a small game arena. +- Start with `sound_speed = 60.0` world units per second. At the default + `1/60` second env tick and current ear spacing, this gives broadside echoes + enough artificial time-of-arrival separation for one ear to be able to hear a + return about one tick before the other. +- `ear_separation_scale` controls the artificial distance between ears as a + multiple of `AGENT_RADIUS`. Keep it bounded; the implementation clamps it to + `[0.25, 2.0]` and the default sweep range is `[0.5, 2.0]`. +- Every echo contribution has: + - two-way distance from mouth/source to reflector to each ear, + - delay derived from speed of sound, + - amplitude falloff from distance and reflector strength, + - left/right ear gain from relative azimuth, + - Doppler shift from reflector radial velocity. + +Point-reflector renderer: + +- v1 should represent walls and obstacle surfaces as stationary point + reflectors. +- Sample each wall and obstacle edge at fixed spacing, + `REFLECTOR_SPACING = 8.0` world units. +- The bug contributes one moving circular/point reflector at its center. +- This avoids wavefront bookkeeping while preserving range, angle, and Doppler + learning signals. 
+ +First-order echoes only: + +- v1 should include direct echo paths from visible surfaces and the bug. +- Multiple-bounce reverberation is out of scope for v1. +- Occlusion can be approximated by ray intersection against the nearest + obstacle along the bat-to-reflector path. +- Segment-level specular reflection and raw waveform propagation are later + variants, not the v1 baseline. + +## Chirp Model + +The policy controls chirp parameters rather than emitting arbitrary audio. + +Chirp parameters: + +- `chirp_start_freq` +- `chirp_end_freq` +- `chirp_duration` + +Derived fields: + +- `chirp_bandwidth = abs(chirp_end_freq - chirp_start_freq)` +- `chirp_slope = (chirp_end_freq - chirp_start_freq) / chirp_duration` +- `chirp_age_ticks = ticks since most recent emitted chirp` + +Defaults: + +- Frequency range is normalized in the policy/action interface and mapped to a + narrow ultrasonic band in the env. +- Duration is normalized in the policy/action interface and mapped to a small + tick/subtick window. +- Up-chirps and down-chirps are both legal. +- A zero-amplitude/no-chirp action should be available so the bat is not forced + to emit every tick. + +Implementation direction: + +- Start with analytic range/Doppler bins, not literal audio buffers. +- If an FFT is needed, use a fixed power-of-two size with precomputed twiddle + factors. +- Prefer precomputed chirp templates or direct bin accumulation for v1 because + this env will run thousands of agents in parallel. +- The v1 observation bins are not raw FFT bins. They are compact + matched-filter-like echo features derived from chirp parameters, delay, + amplitude, and normalized Doppler. +- RayLib eval rendering may play an audible debug version of emitted chirps. + This is render-only and must not run in headless training. The audible sound + maps the normalized chirp band to a human-hearable swept sine while preserving + the selected start frequency, end frequency, and duration. 
+- RayLib eval rendering also supports `env.render_target_fps`; default `60`, + and `0` leaves RayLib uncapped. This is for visualization/audio inspection + only and should not be used in training or sweep interpretation. Audible + debug chirp duration scales as `max(1, 60 / render_target_fps)` so low-FPS + inspection preserves chirp ordering while making each sweep easier to hear. + +## Action Space + +Use a small multi-discrete action space. + +Recommended v1 action heads: + +- `move`: 3 values + - `0`: no thrust + - `1`: thrust forward + - `2`: brake / reduce forward speed +- `turn`: 3 values + - `0`: no turn + - `1`: turn left + - `2`: turn right +- `chirp_start_freq`: discrete bins, default `8` +- `chirp_end_freq`: discrete bins, default `8` +- `chirp_duration`: discrete bins, default `4` +- `chirp_emit`: 2 values + - `0`: do not emit a chirp this tick + - `1`: emit chirp using selected chirp parameters + +Initial action sizes: + +- `ACT_SIZES {3, 3, 8, 8, 4, 2}` +- `NUM_ATNS 6` + +Rationale: + +- Multi-discrete actions let the agent combine flight and active sensing. +- Discrete chirp bins keep the policy simple and cheap. +- Bat movement is scalar forward speed plus heading. The velocity vector is + recomputed as `heading * speed` every tick. +- Brake clamps speed at `bat_min_speed`. The bat cannot fly backward or hover. +- Strafe/lateral velocity is intentionally unavailable. This avoids sideways + spiral policies and makes the visual behavior match the game fantasy better + than a full inertial top-down spacecraft model. +- Continuous actions can be a later variant after the first training baseline + is understood. + +## Observation Space + +Do not expose absolute position, absolute bug position, obstacle map, or global +heading. + +Observation layout: + +1. `left_freq_bins[16]` +2. `right_freq_bins[16]` +3. `chirp_age_norm` +4. `chirp_cooldown_norm` +5. `last_chirp_start_freq_norm` +6. `last_chirp_end_freq_norm` +7. `last_chirp_duration_norm` +8. 
`chirps_used_norm = chirps_used / chirp_budget` +9. `forward_speed_norm` +10. `turn_rate_norm` +11. `timer_norm = elapsed_steps / MAX_STEPS`, clamped to `[0, 1]` + +Initial observation size: + +- `OBS_SIZE = 41` + +Timer normalization: + +- The timer starts at `0.0` on reset. +- With `MAX_STEPS = 512`, after step `N` the observation is + `N / 512.0`. +- The observed timer is clamped to `[0.0, 1.0]`. + +Echo bins: + +- Each ear receives 16 frequency-intensity bins. +- Bins represent the summed intensity arriving at that ear during the current + env tick. +- Values are capped to `[0.0, 1.0]` before policy input. +- No explicit delay/range bins are exposed. +- No chirp means no new echo energy, aside from any later noise model. +- Range must be inferred from when frequency energy returns after an emitted + chirp. +- Doppler shifts move return energy across nearby frequency bins instead of + appearing in a separate Doppler observation channel. + +Echo timing: + +- Chirps schedule analytic echo-arrival events. +- Each event has a receive time, ear, normalized frequency, and intensity. +- On each tick, all events arriving in that tick window are summed into the + corresponding ear frequency bins. +- Multiple reflectors can contribute to the same bin on the same tick. +- Echoes beyond `MAX_ECHO_RANGE` are ignored. +- Implementation should use a fixed future-tick accumulator, not a full active + event scan every env step. The current design buckets each echo by + `ceil(receive_tick)` into `ECHO_QUEUE_TICKS = 256`, sums by + `[ear][freq_bin]`, and processes only the current tick's bucket. +- The accumulator is an implementation detail only. It must preserve the + observation semantics: current-tick per-ear frequency intensities are summed + and capped to `[0.0, 1.0]`; no range/delay axis is exposed. 
+ +Chirp metadata: + +- The agent receives the last emitted chirp start frequency, end frequency, and + duration because interpreting a return depends on knowing the transmitted + signal. + +Current implementation note: + +- The range/Doppler scaffold has been retired in favor of per-tick left/right + frequency spectra generated by analytic echo-arrival events. +- Range is inferred from echo timing and chirp age rather than exposed as an + observation axis. +- See `BAT_SONAR_OBSERVATION_NOTES.md` before changing acoustic observations. +- `chirp_age_norm` lets the policy distinguish fresh echo windows from stale or + silent intervals. + +Self-motion: + +- `forward_speed_norm` and `turn_rate_norm` are proprioceptive signals. +- `forward_speed_norm` is normalized scalar speed and should stay in `[0, 1]`. +- These do not reveal map coordinates or target location. +- They reduce unnecessary burden on recurrent policy memory. + +Model memory note: + +- PufferLib has recurrent policy support through `MinGRU`, `GRU`, and `LSTM`. +- The default config currently uses `MinGRU`, but v1 should not require the + policy to remember chirp identity just to interpret the current acoustic + observation. + +## Reward and Termination + +Reward shaping is intentionally simple in v1. It should make pursuit learnable +without leaking any privileged information through observations. + +Default reward model: + +- `+1.0` for catching the bug. +- Small negative step cost to encourage efficient pursuit. +- Dense progress reward based on reduction in true bat-to-bug distance. +- `-1.0` for hitting walls or obstacles, terminal. +- `-1.0` for timeout, terminal. +- `-1.0` for attempting a chirp after `chirps_used_norm` reaches `1.0`, + terminal. +- Tiny chirp cost so constant chirping is not fully free without causing + chirp collapse. +- Chirping again before the prior chirp's expected bug reflection has returned + gets a small physical overlap penalty. 
This is not a generic timing-efficiency + reward; it represents self-induced acoustic ambiguity from overlapping bug + returns without forcing the bat to wait for every static wall or obstacle + reflection. +- Solve-time chirp efficiency reward: + - `chirp_efficiency = 0.5 + 0.5 * (1.0 - chirps_used / chirp_budget)`, + - a catch after spending the full budget gets efficiency `0.5`, + - a catch with very few chirps approaches efficiency `1.0`, + - `chirp_efficiency_reward` scales this bonus and should be sweepable. +- Sound-derived bug echo progress reward: + - when a bug echo returns with a shorter acoustic path than the previous bug + echo, add a small shaped reward, + - this reward only applies if the bat has moved at least + `BUG_ECHO_MIN_DISPLACEMENT` since the previous scored bug echo, so a + stationary bat cannot farm reward from the bug moving closer by itself, + - farther bug echoes update the previous bug echo path and receive a weaker + penalty scaled by `bug_echo_farther_penalty_scale`, default `0.10`, + - static wall and obstacle echoes do not receive this reward. +- Optional silence bonus or energy budget should wait until the basic task + trains. + +Progress reward: + +- Track previous true bat-to-bug distance internally. +- Reward positive distance reduction. +- Penalize distance increase by the same or smaller scale. +- Do not expose the true distance in observations. 
+- Default formula: + - `reward += progress_reward_scale * (prev_bug_dist - bug_dist)` + - `reward -= step_cost` + - `reward -= CHIRP_COST` when a chirp is emitted; this is hardcoded to + zero for the current Bat defaults + - `reward -= chirp_overlap_penalty * bug_echo_wait_fraction` when a valid + chirp is emitted before the previous chirp's expected bug reflection has + returned + - `reward += chirp_efficiency_reward * chirp_efficiency` on catch + - `reward += bug_echo_reward_scale * echo_path_reduction / MAX_ECHO_RANGE` + when a returning bug echo indicates the bug is closer than the previous bug + echo and the bat has moved enough since that previous echo + - `reward -= bug_echo_reward_scale * bug_echo_farther_penalty_scale * + echo_path_increase / MAX_ECHO_RANGE` when a later moved-enough bug echo is + farther away +- Default starting values: + - `progress_reward_scale = 0.05` + - `step_cost = 0.001` + - `chirp_efficiency_reward = 1.0` + - `chirp_overlap_penalty = 0.004` + - `bug_echo_reward_scale = 0.02` + +Important caveat: + +- Dense distance reward is privileged training signal. It is acceptable for v1 + if the goal is to get learning started, but it should be easy to disable or + scale down once the acoustic policy learns basic pursuit. + +Termination: + +- Success: bat catches bug. +- Failure: bat collides with a wall or obstacle. +- Failure: bat attempts to chirp after exhausting the chirp budget. +- Timeout: `tick >= MAX_STEPS`. + +Reset: + +- New episode samples arena layout, bat spawn, bug spawn, and bug velocity. +- Bat and bug should not spawn overlapping obstacles or each other. +- Initial bug distance should support curriculum. + +W&B exported metrics: + +- Keep the explicit `dict_set(out, ...)` list in `binding.c` small. PufferLib + appends `n`, and lower-value diagnostics should stay internal unless they + are actively needed for sweep decisions. 
+ +- `perf` + - composite sweep objective: + `base_perf * curriculum_difficulty * chirp_perf` +- `base_perf` + - pure catch rate: `1.0` for catching the bug, `0.0` otherwise +- `curriculum_level` +- `curriculum_difficulty` + - weighted normalized episode difficulty from split curriculum components +- `curriculum_perf` + - `base_perf * curriculum_difficulty`; useful diagnostic for level progress + without chirp-budget weighting +- `curriculum_distance_difficulty` +- `curriculum_obstacle_difficulty` +- `score` + - required by PufferLib train worker; do not remove from `binding.c` +- `episode_return` +- `episode_length` +- `collision` +- `timeout` +- `num_obstacles` +- `chirps_emitted` +- `chirp_perf` + - sweep-objective chirp multiplier: + `clamp(1.0 - chirps_emitted / 15.0, 0.05, 1.0)` + - this uses a fixed 15-chirp reference instead of the current per-level + budget so 6-chirp and 8-chirp policies remain meaningfully separated +- `chirp_overlap_fraction` + - fraction of emitted chirps that were sent before the previous chirp's max + return window cleared +- `n` + +## Curriculum + +The first curriculum should keep obstacles present but make target behavior +simple before adding maneuvering. + +Recommended stages: + +- Stage 0: fixed arena, boundary walls, simple fixed obstacles, slow bug with + fixed velocity and bounce behavior. +- Stage 1: same layout class, faster bug with fixed velocity and bounce + behavior. +- Stage 2: randomized obstacles, slow bug with fixed velocity and bounce + behavior. +- Stage 3: randomized obstacles, faster bug with small seeded random turns. +- Stage 4: randomized obstacles, faster bug that can maneuver or flee. +- Stage 5: lower progress reward scale and higher chirp cost. 
+ +Config knobs: + +- `obstacle_min_size` +- `obstacle_max_size` +- `ear_separation_scale` +- `bat_max_speed` +- `bat_min_speed` +- `bat_accel` +- `bat_turn_rate` +- `sound_speed` +- `chirp_cooldown_ticks` +- `chirp_freq_bins` +- `chirp_duration_bins` +- `chirp_efficiency_reward` +- `chirp_overlap_penalty` +- `bug_echo_farther_penalty_scale` +- `step_cost` +- `progress_reward_scale` +- `collision_penalty` +- `curriculum_initial_level` +- `curriculum_stage` + +## PufferLib Integration + +Expected files after spec approval: + +- `ocean/bat/bat.h` +- `ocean/bat/bat.c` +- `ocean/bat/binding.c` +- `ocean/bat/tests/` +- `config/bat.ini` + +Follow the Breakout-style native env shape: + +- Define `Log`. +- Define env struct `Bat`. +- Store required pointers: + - `float* observations` + - `float* actions` + - `float* rewards` + - `float* terminals` + - `int num_agents` + - `Log log` + - `unsigned int rng` +- In `binding.c`, start with: + - `OBS_SIZE 41` + - `NUM_ATNS 6` + - `ACT_SIZES {3, 3, 8, 8, 4, 2}` + - `OBS_TENSOR_T FloatTensor` + - `Env Bat` + +Testing expectations: + +- Unit tests for chirp parameter normalization. +- Unit tests for audible chirp waveform helper math. Rendering playback itself + stays a RayLib eval concern, not a training dependency. +- Unit tests for echo delay and per-tick frequency-bin placement. +- Unit tests for left/right ear asymmetry from azimuth. +- Unit tests for Doppler sign on approaching vs receding bug. +- Unit tests for collision and catch termination. +- Unit tests for progress reward sign. +- Unit tests that wall collision returns `-1.0` and terminates. +- Unit tests that obstacle reflectors create boundary-approach signals. + +## Open Design Questions + +Reward shaping: + +- The first implementation should use the default shaping constants above. +- After the first trainability pass, decide whether to clip progress reward, + anneal privileged progress reward down, or increase chirp cost.
+ +Acoustic representation: + +- v1 uses 16 current-tick frequency-intensity bins per ear. +- A later variant can test more frequency bins, a flattened range-Doppler grid, + or literal FFT bins. + +Bug behavior: + +- v1 starts with fixed-velocity bounce behavior. +- Later curriculum stages add seeded random turns and maneuvering. + +Obstacle reflections: + +- v1 samples walls and obstacle edges into point reflectors. +- Later variants can compare analytic segment reflections or multiple-bounce + reflections. + +## Training and Sweep Operations + +- Curriculum design notes are tracked in `BAT_CURRICULUM.md`. Keep that file + updated when changing level progression, difficulty metrics, or bug motion + rungs. +- Current curriculum cleanup is documented in `BAT_CURRICULUM.md`: level 0 + starts with no obstacles, chirp-budget pressure is separate from curriculum + difficulty, and curriculum difficulty uses distance/obstacles only. +- Keep `base_perf` as pure catch rate. Use composite `perf` as the sweep + objective. It rewards catching harder curriculum levels with fewer chirps + without changing in-episode reward shaping: + `perf = base_perf * curriculum_difficulty * chirp_perf`. +- `chirp_perf` uses a fixed 15-chirp reference: + `clamp(1.0 - chirps_emitted / 15.0, 0.05, 1.0)`. This intentionally gives + strong sweep-ranking separation between 10, 8, and 6 chirps. Do not multiply + `perf` by both `budget_difficulty` and `chirp_efficiency`; that made the + metric harder to reason about and double-counted chirp pressure. +- Keep `score` exported in `binding.c`. If the 31-metric cap is tight, drop + `episode_return` before dropping `score`; PufferLib reads `metrics["env/score"]` + when train workers finish. +- Reward terms are training scaffolding and should remain sweepable. `progress_reward_scale` is true-distance shaping and should usually stay below `bug_echo_reward_scale`, which is based on closer received bug reflections. 
+- Forward-only movement dynamics should be swept with bounded ranges: + `env.bat_max_speed` in `[8.0, 22.0]`, `env.bat_min_speed` in `[2.0, 6.0]`, + `env.bat_accel` in `[40.0, 90.0]`, and `env.bat_turn_rate` in `[4.0, 3pi]`. +- Do not remove the minimum forward speed invariant. If the bat can hover at + zero velocity, PPO can learn a bad local optimum where it avoids collision + and timeout-shapes instead of exploring movement. +- Bug-echo progress shaping must be gated on bat displacement. Closer bug + echoes can reward, and farther bug echoes can weakly penalize, but neither + should pay out when the bat has not moved enough since the prior bug echo. +- Acoustic scale terms should be swept before increasing model size. Current bounded acoustic sweep knobs are `env.sound_speed` in `[80.0, 180.0]` and `env.ear_separation_scale` in `[1.0, 3.0]`. +- The June 9, 2026 `bat1` sweep strongly improved after the forward-only + dynamics change. Best observed run was `sage-cherry-92` with `perf ~= 0.953`, + `SPS ~= 2.06M`, collision `~= 0.031`, and timeout `~= 0.016`. The old default + had higher SPS but poor `perf`, so use `perf` first and SPS only as a + tie-breaker. +- That sweep pushed several bounds upward: `bat_accel`, `bat_turn_rate`, + `sound_speed`, `ear_separation_scale`, `progress_reward_scale`, + `replay_ratio`, and often `ent_coef`. It pushed `step_cost` and + `valid_chirp_reward` down. Defaults in `config/bat.ini` now track the best + high-perf region rather than the highest-SPS failed default. +- Train workers should use CUDA with `--train.gpus 1`. +- Protein/sweep control does not need CUDA. Run sweeps with `--sweep.use-gpu ""` so the optimizer stays off CUDA and avoids CUDA IPC/resource-handle failures. +- Do not override training duration with ad hoc `--train.total-timesteps`. Put duration ranges in `config/bat.ini`. +- Keep Bat sweep ranges bounded so a sweep cannot accidentally launch huge slow models. +- Do not use `sweep_only` in Bat config. 
Keep the config clean and bound the + actual sweep sections/defaults instead. +- The default Bat sweep does not sweep policy model size; it keeps `policy.hidden_size = 128` and `policy.num_layers = 4`. Current cost-sensitive sweep bounds cap training duration at `50_000_000`, rollout horizon at `128`, replay ratio at `1.25`, and `vec.num_buffers` at `8`. +- Do not add broad model-size sweep ranges. If model size must be swept later, require explicit human approval and keep a hard ceiling of `policy.hidden_size <= 256` and `policy.num_layers <= 4` unless there is a measured SPS reason to widen it. +- Keep PufferLib core stock for Bat. If sweep parsing conflicts with inherited default sweep keys, solve it through Bat config or command-line args, not core edits. +- Checkpoints trained before the forward-only action model are stale. After + changing action dimensions or movement semantics, run a normal `train bat` + before `eval bat --load-model-path latest`. +- On this PufferLib branch, `sweep bat --sweep.max-runs 2` is not enough to + exercise suggested hyperparameters: the first two launched experiments use + the current config defaults, and `sweep_obj.suggest(...)` is only called for + later runs. Use at least `--sweep.max-runs 3` for one actual suggestion, or + run explicit bounded comparison trains when testing a small acoustic grid. +- Curriculum difficulty should not advance on a single lucky catch. `env.curriculum_successes_per_level` gates advancement so each env must catch the bug multiple times at the current level before increasing bug distance or obstacle count. + +## Near-Term Roadmap + +Keep these changes small and reversible. Use TDD for env behavior changes, +train/eval after each rung, and commit each known-good rung separately. + +1. Harder curriculum and eval difficulty. + - Plain eval starts from a fresh env at curriculum level 0, so the bug can + look too close even when training eventually reaches harder levels. 
+ - Add a configurable initial curriculum level so eval can start at a + representative harder level without requiring manual in-session catches. + - Increase the maximum curriculum bug distance so longer runs can keep + getting harder after the current successful range. + - Preserve monotonic progress: once an env advances above the configured + initial level, resets must not drop it back down. + +2. Finite chirp budget. + - Keep the low-curriculum budget below the old `20`-chirp setting; `20` + proved too easy and should not be part of the default sweep. + - Keep the chirp budget fixed across curriculum levels. Harder levels and + clutter legitimately need reacquisition chirps, so budget decay made + later levels fail for the wrong reason. + - Track `chirps_used / chirp_budget` as a normalized `0..1` observation. + - When the budget is exhausted, terminate with a `-1.0` failure penalty if + the policy attempts another chirp. Do not terminate immediately after the + last valid chirp, so the final echo can still matter. + - Keep chirp-use pressure visible through `chirps_emitted`, `chirp_perf`, + and the `chirps_used / chirp_budget` observation. Do not export duplicate + budget-ratio logs unless a future sweep needs them. + - Add a sweepable solve-time efficiency reward where spending the full + budget scores `0.5` on the efficiency component and using very few chirps + approaches `1.0`. + +3. Later bug motion curriculum. + - Keep the current fixed-velocity bounce bug as the base rung. + - Add later stages for sine/cosine perturbations, circular/arc paths, and + simple maneuvers. + - Sweep bug speed and maneuver amplitude only after harder curriculum and + chirp budget are stable. 
diff --git a/PR_AND_MERGE.md b/PR_AND_MERGE.md new file mode 100644 index 0000000000..acbf1051d9 --- /dev/null +++ b/PR_AND_MERGE.md @@ -0,0 +1,121 @@ +# Bat PR And Merge Notes + +## Determinism Terms + +- **Old-baseline trajectory equivalence**: same code/config/seed reproduces the + current exact training trajectory, scalar signature, checkpoint behavior, and + level-10 eval behavior. +- **Deterministic reproducibility**: same code/config/seed reproduces the same + result after we intentionally change behavior. + +For cleanup before the first merge, preserve old-baseline trajectory equivalence +unless we explicitly decide a change belongs in the new deterministic baseline. +Later behavior-breaking cleanups are allowed, but each one needs a fresh +reproducible training/eval signature. + +## Most Embarrassing Review Targets + +| Area | Why Joseph might call it out | Cleanup class | +| --- | --- | --- | +| `c_step()` terminal/reward flow | Over-budget chirp, collision, success, and timeout are detected in different branches, then partially consolidated later. It is correct enough, but less direct than Breakout/Boxoban/G2048. | Try to preserve old baseline first; larger reshaping may break it. | +| `compute_observations()` side effects | The name says observations, but it also consumes echo buckets and sets `tick_bug_echo_path`, which later affects reward. This is now direct in the function instead of hidden behind a one-use helper. | Preserve old baseline unless the echo reward order is deliberately changed. | +| `schedule_echo()` size | It mixes heading math, ear directivity, path/range checks, Doppler, attenuation, and queue writes in one function. | Preserve old baseline by extracting repeated left/right queueing only. | +| `reset_bug_motion()` and `update_bug()` | Three maneuver modes, inbound special cases, sign state, bounce repair, and multiple curriculum helpers are too much. A single sine wave with curriculum-ramped amplitude would be cleaner. 
| New deterministic baseline. This will likely break old behavior. | +| Spawn helpers | Exact-distance spawn and fallback quadrant spawn now live in one function, but the fallback loops remain defensive. | New deterministic baseline if RNG order changes. | +| Obstacle generation | `rects_overlap()`, `obstacle_clear()`, 96 attempts, and fallback placements are probably more safety than we need. | New deterministic baseline. Remove if overlapping random obstacles are acceptable. | +| Curriculum difficulty logs | `curriculum_distance_difficulty()`, `curriculum_obstacle_difficulty()`, `curriculum_motion_difficulty()`, and `curriculum_difficulty()` are a lot of code for diagnostics/objective shaping. | Likely behavior/metric breaking; do later. | +| Chirp efficiency / chirps-used logs | `chirps_used_ratio()` is still an observation, but reward/log helpers around sparse chirping are low conviction. | Keep observation if needed; remove reward/log parts in new baseline. | +| Demo defaults in `bat.c` | `set_demo_defaults()` duplicates `config/bat.ini`, which can drift. | Cleanup after deciding how the human demo path should load defaults. | +| Magic constants | Echo strengths, attenuation constants, spawn attempts, obstacle margins, first chirp defaults, and render colors are mostly unnamed. | Rename constants where it clarifies intent; avoid sweeping constant churn. | + +## Preserve Old Baseline First + +These changes should be attempted one at a time with the full gate: + +1. Reduce duplicated left/right queueing in `schedule_echo()`. +2. Keep `compute_observations()` order stable. Echo bucket observation copying is + now direct in the function. +3. Simplify local renderer helpers and repeated static reflector drawing. +4. Remove dead fields, dead constants, and obviously unreachable guards. +5. Keep `c_step()` reward order stable unless we deliberately decide to break + old-baseline trajectory equivalence. 
+ +Gate after each code change: + +```bash +source .venv/bin/activate && ./build.sh bat +source .venv/bin/activate && bash ocean/bat/tests/run_all.sh +.venv/bin/python -m pufferlib.pufferl train bat --train.gpus 1 +timeout 45s env DISPLAY=:0 .venv/bin/python -m pufferlib.pufferl eval bat --load-model-path latest --env.curriculum-initial-level 10 --env.curriculum-successes-per-level 1000000 +``` + +Known old-baseline training signature: + +- `perf 0.556` +- `base_perf 0.950` +- `timeout 0.009` +- `chirps_emitted 5.191` + +## Current Safe-Cleanup Notes + +- Keep the normal `init()`, `allocate()`, `c_close()`, and `free_allocated()` + shape. Breakout and G2048 use this pattern too, even when Bat currently has + less heap-owned state after obstacle arrays became fixed-size. +- Breakout resets `terminals[0]` and `rewards[0]` at the top of `c_step()`; + Boxoban increments `tick`, clears terminal/reward, then handles success and + timeout as separate early-return branches. Bat's current terminal block is + somewhat more abstract, but changing the reward/terminal order belongs in the + new-baseline phase unless we intentionally stop matching the old run. +- Avoid changing reward arithmetic order in `c_step()` while preserving the old + baseline. The one-line reward fold already proved it can break trajectory + equivalence. +- `schedule_ear_echo()` is worth keeping for now. It is a small helper that + removed duplicated left/right attenuation and receive-time logic. +- `norm_bin()` is small, but it names the action-bin normalization used by three + chirp fields. Inlining it would save little and may make `try_emit_chirp()` + less readable. +- `compute_observations()` now copies due echo buckets directly into + observations and updates `tick_bug_echo_path` in place. Build, tests, + training signature, and level-10 eval all preserved the old baseline. 
+- The fallback spawn cleanup removed the one-use `sample_spawns()` helper and + kept the same RNG order inside `sample_spawns_at_distance()`. Build, tests, + training signature, and level-10 eval all preserved the old baseline. +- Chirp slice scheduling no longer pre-fills every future source slot at emit + time, and constructs the per-slice echo source explicitly instead of copying + the whole `ChirpEvent`. Build, tests, training signature, and level-10 eval + all preserved the old baseline. +- Expected bug echo timing now reads the just-emitted chirp source directly + instead of asking for slice `0` before any slices are scheduled. Build, tests, + training signature, and level-10 eval all preserved the old baseline. + +## Next Old-Baseline Candidates + +These are candidates only after the latest visual gate is confirmed: + +1. Revisit any remaining one-use render helpers, but only if removal reduces + lines without making `draw_freq_history_panel()` harder to scan. +2. Look for dead test-only exposure caused by removed helpers. The tests should + assert behavior, not preserve helpers just because they were previously + callable. +3. Review tiny math helpers one at a time. Keep helpers that name a real domain + concept (`chirp_slice_ticks`, `chirp_age_norm_denominator`); consider + inlining helpers that merely restate one field expression. +4. Leave `c_step()` structural reshaping for later. It is one of the highest + review-value areas, but it is also one of the easiest ways to break old + trajectory equivalence. + +## New Deterministic Baseline Later + +These are probably the real pre-merge quality wins, but they should be grouped +after we are ready to stop matching the current trajectory exactly: + +1. Replace bug maneuver modes with one always-active sine-wave path and + curriculum-ramped amplitude. +2. Simplify spawn and obstacle generation, including removing overlap checks if + overlapping obstacles are acceptable. +3. 
Remove low-conviction curriculum difficulty and chirp efficiency logs/reward + shaping. +4. Rework `c_step()` into a direct step, reward, done, log/reset shape matching + the simpler reference envs. +5. Reconsider the initial chirp observation defaults instead of pretending a + chirp happened before the episode starts. diff --git a/config/bat.ini b/config/bat.ini new file mode 100644 index 0000000000..b66260d9cd --- /dev/null +++ b/config/bat.ini @@ -0,0 +1,295 @@ +[base] +env_name = bat + +[vec] +total_agents = 4096 +num_buffers = 4 +num_threads = 8 + +[policy] +hidden_size = 256 +num_layers = 5 +expansion_factor = 1 + +[torch] +network = MinGRU +encoder = DefaultEncoder +decoder = DefaultDecoder + +[env] +max_speed = 15.498233877318418 +min_speed = 2.6389946132676654 +accel = 53.02330161128345 +turn_rate = 8.371655963408276 +render_target_fps = 60 +record_video = 0 +record_video_fps = 30 +record_video_seconds = 30 +record_video_audio = 1 +bug_echo_farther_penalty_scale = 0.19351291407677712 +bug_echo_reward_scale = 0.35 +bug_wing_sideband_gain = 0.19056934455600955 +curriculum_initial_level = 1 +curriculum_obstacle_step = 8 +curriculum_start_bug_distance = 8.438008720355143 +curriculum_successes_per_level = 4 +ear_separation_scale = 2.0 +ear_rear_gain = 0.22038613968607276 +ear_front_gain = 0.6419214149115183 +ear_side_gain = 0.28043867572747055 +early_chirp_penalty = 0.006 +progress_reward_scale = 0.12 +reflector_strength = 0.6 +sound_speed = 180.0 +step_cost = 0.00010781401476030468 +valid_chirp_reward = 0.00015478540834814922 +chirp_cooldown_ticks = 11 +chirp_efficiency_reward = 2.0 +chirp_overlap_penalty = 0.004278154705335052 +collision_penalty = 1.950717141233687 + +[train] +anneal_ent_coef = 0 +anneal_lr = 1 +beta1 = 0.6151083880184249 +beta2 = 0.9994430814361022 +clip_coef = 0.6358533174485217 +ent_coef = 0.0012852601662540154 +eps = 6.005678002222838e-10 +gae_lambda = 0.9208599830048286 +gamma = 0.99842121229845 +gpus = 1 +horizon = 64 +learning_rate = 
0.011919361446426807 +max_grad_norm = 1.5952002930880629 +min_ent_coef_ratio = 0.1 +min_lr_ratio = 0 +minibatch_size = 8192 +prio_alpha = 0.9144113738603952 +prio_beta0 = 1.0 +replay_ratio = 1.1554225446340287 +seed = 42 +total_timesteps = 33699113.0 +vf_clip_coef = 0.01 +vf_coef = 5.0 +vtrace_c_clip = 2.7120354439967884 +vtrace_rho_clip = 5.0 + +[sweep] +method = Protein +metric = perf +metric_distribution = linear +goal = maximize +gpus = 1 +downsample = 5 +use_gpu = True + +[sweep.train.total_timesteps] +distribution = log_normal +min = 30_000_000 +max = 50_000_000 +scale = auto + +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 64 +max = 512 +scale = auto + +[sweep.policy.num_layers] +distribution = int_uniform +min = 2 +max = 5 +scale = auto + +[sweep.train.learning_rate] +distribution = log_normal +min = 0.01 +max = 0.05 +scale = auto + +[sweep.train.gamma] +distribution = logit_normal +min = 0.98 +max = 0.9995 +scale = auto + +[sweep.train.gae_lambda] +distribution = logit_normal +min = 0.92 +max = 0.98 +scale = auto + +[sweep.train.ent_coef] +distribution = log_normal +min = 0.0005 +max = 0.03 +scale = auto + +[sweep.train.horizon] +distribution = uniform_pow2 +min = 64 +max = 256 +scale = auto + +[sweep.train.replay_ratio] +distribution = uniform +min = 0.75 +max = 1.25 +scale = auto + +[sweep.vec.num_buffers] +distribution = int_uniform +min = 4 +max = 8 +scale = auto + +[sweep.env.max_speed] +distribution = uniform +min = 8.0 +max = 30.0 +scale = auto + +[sweep.env.min_speed] +distribution = uniform +min = 2.0 +max = 6.0 +scale = auto + +[sweep.env.accel] +distribution = uniform +min = 40.0 +max = 90.0 +scale = auto + +[sweep.env.turn_rate] +distribution = uniform +min = 4.0 +max = 9.4247780 +scale = auto + +[sweep.env.step_cost] +distribution = uniform +min = 0.0001 +max = 0.0008 +scale = auto + +[sweep.env.sound_speed] +distribution = uniform +min = 80.0 +max = 180.0 +scale = auto + +[sweep.env.ear_separation_scale] +distribution = 
uniform +min = 1.0 +max = 2.0 +scale = auto + +[sweep.env.ear_rear_gain] +distribution = uniform +min = 0.10 +max = 0.30 +scale = auto + +[sweep.env.ear_front_gain] +distribution = uniform +min = 0.40 +max = 0.75 +scale = auto + +[sweep.env.ear_side_gain] +distribution = uniform +min = 0.20 +max = 0.55 +scale = auto + +[sweep.env.reflector_strength] +distribution = uniform +min = 0.6 +max = 3.0 +scale = auto + +[sweep.env.progress_reward_scale] +distribution = uniform +min = 0.04 +max = 0.12 +scale = auto + +[sweep.env.valid_chirp_reward] +distribution = uniform +min = 0.0 +max = 0.0015 +scale = auto + +[sweep.env.chirp_efficiency_reward] +distribution = uniform +min = 0.0 +max = 2.0 +scale = auto + +[sweep.env.early_chirp_penalty] +distribution = uniform +min = 0.001 +max = 0.006 +scale = auto + +[sweep.env.chirp_cooldown_ticks] +distribution = int_uniform +min = 6 +max = 18 +scale = auto + +[sweep.env.chirp_overlap_penalty] +distribution = uniform +min = 0.001 +max = 0.030 +scale = auto + +[sweep.env.bug_echo_reward_scale] +distribution = uniform +min = 0.02 +max = 0.35 +scale = auto + +[sweep.env.bug_echo_farther_penalty_scale] +distribution = uniform +min = 0.05 +max = 0.20 +scale = auto + +[sweep.env.bug_wing_sideband_gain] +distribution = uniform +min = 0.05 +max = 0.25 +scale = auto + +[sweep.env.collision_penalty] +distribution = uniform +min = 0.5 +max = 2.0 +scale = auto + +[sweep.env.curriculum_initial_level] +distribution = int_uniform +min = 0 +max = 5 +scale = auto + +[sweep.env.curriculum_start_bug_distance] +distribution = uniform +min = 8.0 +max = 20.0 +scale = auto + +[sweep.env.curriculum_obstacle_step] +distribution = int_uniform +min = 3 +max = 8 +scale = auto + +[sweep.env.curriculum_successes_per_level] +distribution = int_uniform +min = 4 +max = 16 +scale = auto diff --git a/config/default.ini b/config/default.ini index 29bc1808b7..5bec213815 100644 --- a/config/default.ini +++ b/config/default.ini @@ -100,13 +100,6 @@ downsample = 5 use_gpu 
= True prune_pareto = True early_stop_quantile = 0.3 -# When set, each sweep trial is scored by winrate in a match against a fixed -# enemy checkpoint rather than by the training-time env/score. Score key emitted -# as env/match_score; set match_enemy_model_path to '' to disable. -match_enemy_model_path = '' -match_num_games = 1024 -match_enemy_hidden_size = 0 -match_enemy_num_layers = 0 [sweep.train.total_timesteps] distribution = log_normal diff --git a/ocean/bat/BAT_EAR_DIRECTIVITY_RESEARCH.md b/ocean/bat/BAT_EAR_DIRECTIVITY_RESEARCH.md new file mode 100644 index 0000000000..ca03e876c5 --- /dev/null +++ b/ocean/bat/BAT_EAR_DIRECTIVITY_RESEARCH.md @@ -0,0 +1,288 @@ +# Bat ear directivity research notes + +Purpose: preserve research and implementation guidance for a possible low-cost directional hearing model in `ocean/bat/`. + +Status: research/design note only. No behavior change is implied by this document. + +## Short answer + +Yes, the model should not treat each ear as an omnidirectional scalar receiver. Bat echolocation uses directional emission, directional reception, and binaural differences. The useful terms are: + +- `HRTF`: head-related transfer function, the direction-dependent filtering from a sound source to each ear. +- `HRIR`: time-domain head-related impulse response. +- `ILD`: interaural level difference, the loudness/intensity difference between left and right ears. +- `ITD`: interaural time difference, the arrival-time difference between ears. +- `Pinna directivity`: direction-dependent gain/filtering caused by the external ear shape. +- `Beam pattern` or `polar response`: gain as a function of angle. +- `Acoustic field of view`: the spatial volume that is ensonified or heard well enough for detection. + +For Bat env purposes, the best first implementation is a cheap per-ear gain curve in `bat_schedule_echo`, based on relative angle to target/obstacle/echo source. 
It should use dot products and multiplications, not `atan2f`, not tables, and not per-frequency filters. + +## What the literature says + +### Bats have directional sonar emission and dynamic beam width + +Jakobsen, Ratcliffe, and Surlykke found that multiple vespertilionid species converge on similar sonar fields of view. The Nature abstract reports a directivity index around `11 +/- 1 dB`, half-amplitude angle about `37 degrees`, and on-axis source level around `108 +/- 4 dB SPL re 20 uPa rms at 10 cm` under their tested condition. + +Source: + +- Jakobsen, L.; Ratcliffe, J. M.; Surlykke, A. `Convergent acoustic field of view in echolocating bats`. Nature 493, 93-96, 2013. DOI: https://doi.org/10.1038/nature11664 +- Nature page: https://www.nature.com/articles/nature11664 + +Implementation relevance: + +- The environment already has directional structure via left/right echo channels, but the hearing side can plausibly become more directional. +- A simple polar response is justified: forward is strong, rear is weak, lateral differs by ear. +- A 2D game does not need full 3D HRTF. The important behavioral signal is `left/right relative energy`, not spectral notches. + +### Directionality and intensity jointly define what the bat can detect + +Jakobsen, Brinklov, and Surlykke reviewed bat echolocation intensity and directionality. Key implementation-relevant points: + +- Bat calls are directional; more energy is focused forward than to the sides. +- An object detectable directly in front at a given range may not be detectable at the same range off-axis. +- Directionality reduces clutter because less energy is emitted to the sides/back. +- Beam shape acts as a spatial filter before echoes return. +- Bats dynamically control intensity, duration, frequency, and directionality. +- Nose emitters can have beam shape affected by nostril separation and noseleaf geometry. +- Mouth emitters can affect directionality via gape size. 
+ +Source: + +- Jakobsen, L.; Brinklov, S.; Surlykke, A. `Intensity and directionality of bat echolocation signals`. Frontiers in Physiology 4:89, 2013. DOI: https://doi.org/10.3389/fphys.2013.00089 +- Open full text: https://pmc.ncbi.nlm.nih.gov/articles/PMC3635024/ + +Implementation relevance: + +- If we add hearing directivity, it should be part of the echo energy calculation, not an observation post-process. +- It should affect both bug and obstacle echoes consistently. +- We should keep it cheap enough to run per echo/event/source. + +### Bats can broaden beams in terminal pursuit + +Jakobsen and Surlykke showed that `Myotis daubentonii` and `Eptesicus serotinus` broaden their biosonar beam during prey pursuit. Crossref metadata includes the useful quantitative anchor: `M. daubentonii` increased half-amplitude angle from about `40 degrees` to about `90 degrees` horizontally and from about `45 degrees` to more than `90 degrees` vertically, mostly by dropping call frequency by about one octave from `55 kHz` to `27.5 kHz`. + +Source: + +- Jakobsen, L.; Surlykke, A. `Vespertilionid bats control the width of their biosonar sound beam dynamically during prey pursuit`. PNAS 107(31), 13930-13935, 2010. DOI: https://doi.org/10.1073/pnas.1006630107 +- PNAS page: https://www.pnas.org/doi/10.1073/pnas.1006630107 + +Implementation relevance: + +- This is more about emission than reception, but it argues against a single static omnidirectional model. +- We do not need to implement dynamic beam width yet. It would be a meaningful physics change and should be isolated in a sweep. +- If implemented later, chirp duration/frequency choices could alter beam width. That would make action consequences richer, but it is not the minimum ear-directivity change. + +### Reception-side filtering matters too + +Wotton, Jenison, and Hartley modeled/combined emission and external-ear reception in the big brown bat. 
Their abstract says localization cues become clearer when emission spectra and external-ear spectra are convolved; spectral peaks sharpen and peak/notch contrast increases. It also notes cues restricted to a cone of about `+/-30 degrees`. + +Source: + +- Wotton, J. M.; Jenison, R. L.; Hartley, D. J. `The combination of echolocation emission and ear reception enhances directional spectral cues of the big brown bat, Eptesicus fuscus`. JASA 101(3), 1723-1733, 1997. DOI: https://doi.org/10.1121/1.418271 +- AIP/JASA page: https://pubs.aip.org/asa/jasa/article/101/3/1723/559358/The-combination-of-echolocation-emission-and-ear + +Implementation relevance: + +- Full spectral filtering is overkill for current Bat. The obs are low-dimensional echo features, not raw waveforms. +- A cheap gain curve per ear captures the important part for policy learning: direction-dependent intensity. +- Avoid adding FFTs, filters, or per-frequency HRTF tables unless the environment changes to raw audio observations. + +### Noseleaf and pinnae can cooperate dynamically + +Kuc proposed a model where noseleaf and pinnae cooperate through direct and delayed acoustic paths. The abstract says the delayed pinna component can increase on-axis emission strength, narrow beam width, and sculpt frequency-dependent beam patterns. + +Source: + +- Kuc, R. `Morphology suggests noseleaf and pinnae cooperate to enhance bat echolocation`. JASA 128(5), 3190-3199, 2010. DOI: https://doi.org/10.1121/1.3488304 +- AIP/JASA page: https://pubs.aip.org/asa/jasa/article/128/5/3190/917806/Morphology-suggests-noseleaf-and-pinnae-cooperate + +Zhang et al. studied great roundleaf bats and found coordinated noseleaf and pinna movements during echolocation. + +Source: + +- Zhang, S.; et al. `Dynamic relationship between noseleaf and pinnae in echolocating hipposiderid bats`. Journal of Experimental Biology, 2019. 
DOI: https://doi.org/10.1242/jeb.210252 +- JEB page: https://journals.biologists.com/jeb/article/222/20/jeb210252/224403/Dynamic-relationship-between-noseleaf-and-pinnae-in + +Vanderelst et al. found that the noseleaf of `Rhinolophus formosae` focuses the FM component of calls. + +Source: + +- Vanderelst, D.; Lee, Y.-F.; Geipel, I.; Kalko, E. K. V.; Kuo, Y.-M.; Peremans, H. `The noseleaf of Rhinolophus formosae focuses the Frequency Modulated (FM) component of the calls`. Frontiers in Physiology 4:191, 2013. DOI: https://doi.org/10.3389/fphys.2013.00191 +- Frontiers page: https://www.frontiersin.org/articles/10.3389/fphys.2013.00191/full + +Implementation relevance: + +- These papers support a directional receive model, but they also warn that exact geometry is species-specific and complex. +- For Bat env, do not model moving pinnae/noseleaf first. That would create extra state and new parameters without proving learning benefit. +- Keep the first model static and symmetric, then sweep it. + +## Current likely Bat code location + +The directivity should probably be applied in or near the echo scheduling/energy path, around the existing left/right echo gain logic. Earlier review found a mild directional term like this in `bat_schedule_echo`: + +```c +float left_gain = 0.75f + 0.25f * something; +float right_gain = 0.75f + 0.25f * something; +``` + +That is a weak directional receiver. A stronger, biologically motivated model would replace that with a front-and-side polar response. + +Do not add this in render/audio code. The training observation echo energy must change, not only playback. + +## Recommended cheap implementation + +Use only normalized source direction and bat forward/side vectors. No angle, no trig. + +Definitions: + +- `ux, uy`: unit vector from bat to echo source. +- `fx, fy`: bat forward unit vector. +- `lx, ly`: bat left-ear preferred lateral unit vector, usually left of forward. 
+- `rx, ry`: bat right-ear preferred lateral unit vector, usually right of forward. +- `front`: nonnegative forward alignment. +- `left_side`: nonnegative left-ear side alignment. +- `right_side`: nonnegative right-ear side alignment. +- `rear_floor`: minimum rear sensitivity so rear echoes are not impossible. + +Sketch: + +```c +float front = bat_clampf(ux*fx + uy*fy, 0.0f, 1.0f); +float left_side = bat_clampf(ux*lx + uy*ly, 0.0f, 1.0f); +float right_side = bat_clampf(ux*rx + uy*ry, 0.0f, 1.0f); + +float front2 = front * front; +float left2 = left_side * left_side; +float right2 = right_side * right_side; + +float left_gain = rear_floor + front_gain*front2 + side_gain*left2; +float right_gain = rear_floor + front_gain*front2 + side_gain*right2; +``` + +Potential initial constants: + +```c +#define BAT_EAR_REAR_GAIN 0.15f +#define BAT_EAR_FRONT_GAIN 0.55f +#define BAT_EAR_SIDE_GAIN 0.45f +``` + +Normalize if needed: + +```c +#define BAT_EAR_GAIN_NORM (1.0f / (BAT_EAR_REAR_GAIN + BAT_EAR_FRONT_GAIN + BAT_EAR_SIDE_GAIN)) +left_gain *= BAT_EAR_GAIN_NORM; +right_gain *= BAT_EAR_GAIN_NORM; +``` + +This caps gain at `1.0`, gives front-left stronger left signal, front-right stronger right signal, and keeps rear echoes weak but nonzero. Note that when the side axes are perpendicular to forward, `front2 + left2 <= 1`, so the front and side terms cannot both peak: the normalized maximum is `(BAT_EAR_REAR_GAIN + BAT_EAR_FRONT_GAIN) * BAT_EAR_GAIN_NORM`, about `0.61` with the constants above, not `1.0`. Rescale if the current observation scale must be preserved. + +## Variant: ear axes angled forward + +Pure side vectors can make lateral echoes too strong compared with forward echoes. A better biological-ish 2D approximation is ears pointed outward but forward-biased. + +Given forward `f` and left normal `n`: + +```c +float ear_forward = 0.75f; +float ear_side = 0.66f; +float left_ear_x = ear_forward*fx + ear_side*nx; +float left_ear_y = ear_forward*fy + ear_side*ny; +float right_ear_x = ear_forward*fx - ear_side*nx; +float right_ear_y = ear_forward*fy - ear_side*ny; +``` + +If `ear_forward^2 + ear_side^2` is approximately `1`, no normalization needed. `0.75/0.66` is close enough for a cheap model.
+ +Then: + +```c +float left_lobe = bat_clampf(ux*left_ear_x + uy*left_ear_y, 0.0f, 1.0f); +float right_lobe = bat_clampf(ux*right_ear_x + uy*right_ear_y, 0.0f, 1.0f); +left_gain = rear_floor + main_gain * left_lobe * left_lobe; +right_gain = rear_floor + main_gain * right_lobe * right_lobe; +``` + +This is even simpler and likely enough. + +## Performance considerations + +Good: + +- Dot products. +- Multiplication for squaring. +- `bat_clampf` or inline clamp. +- Constants as `#define`. + +Avoid: + +- `atan2f` per echo. +- `cosf`/`sinf` per echo if forward/side vectors already exist. +- Per-frequency HRTF tables. +- New heap allocations. +- Raw audio convolution. + +The model should cost only a few multiplies per scheduled echo. + +## Expected behavior change + +Likely effects: + +- Better left/right spatial signal when target is off-center. +- Rear obstacles/bugs become less audible. +- Policy may learn to turn/scan because facing matters more. +- Existing trained checkpoint performance may change because observations change. + +Potential risk: + +- If rear/side gain is too low, exploration may get harder. +- If gains are not normalized, reward/observation scale may drift. +- If directivity is applied on top of an already strong directional term, left/right energy may saturate. + +## Sweep recommendation + +Do not combine this with wing micro-Doppler in the same first sweep. Use a clean ablation: + +- Baseline: current Bat after timer/log/audio cleanup. +- Variant A: static ear directivity only. +- Variant B: wing sidebands only. +- Variant C: both, only if A and B individually help or at least do not hurt. + +Suggested parameters for first sweep: + +```ini +[env] +ear_directivity_enabled = 1 +ear_rear_gain = 0.15 +ear_front_gain = 0.55 +ear_side_gain = 0.45 +``` + +If avoiding config bloat, hard-code the first constants behind defines and sweep by branch/commit instead. 
+ +## Implementation checklist + +- Apply directivity before writing echo energy into observations. +- Apply to bug and obstacle echoes unless there is a specific reason not to. +- Keep left/right symmetry exact. +- Keep max gain normalized near current max so observation scale does not drift hard. +- Add one focused C test for left/right asymmetry if tests are desired. +- Run build/tests/train/eval before comparing performance. + +## Source list + +- https://doi.org/10.1038/nature11664 +- https://www.nature.com/articles/nature11664 +- https://doi.org/10.3389/fphys.2013.00089 +- https://pmc.ncbi.nlm.nih.gov/articles/PMC3635024/ +- https://doi.org/10.1073/pnas.1006630107 +- https://www.pnas.org/doi/10.1073/pnas.1006630107 +- https://doi.org/10.1121/1.418271 +- https://pubs.aip.org/asa/jasa/article/101/3/1723/559358/The-combination-of-echolocation-emission-and-ear +- https://doi.org/10.1121/1.3488304 +- https://pubs.aip.org/asa/jasa/article/128/5/3190/917806/Morphology-suggests-noseleaf-and-pinnae-cooperate +- https://doi.org/10.1242/jeb.210252 +- https://journals.biologists.com/jeb/article/222/20/jeb210252/224403/Dynamic-relationship-between-noseleaf-and-pinnae-in +- https://doi.org/10.3389/fphys.2013.00191 +- https://www.frontiersin.org/articles/10.3389/fphys.2013.00191/full diff --git a/ocean/bat/BAT_NEXT_SWEEP_RESEARCH_NOTES.md b/ocean/bat/BAT_NEXT_SWEEP_RESEARCH_NOTES.md new file mode 100644 index 0000000000..36086018ab --- /dev/null +++ b/ocean/bat/BAT_NEXT_SWEEP_RESEARCH_NOTES.md @@ -0,0 +1,135 @@ +# Bat next sweep research notes + +Purpose: concise decision notes for future agents before changing Bat physics. + +Status: planning note only. + +## Current baseline to preserve first + +Before adding new physics, commit and sweep the current Bat state that already includes: + +- Timer observation normalized `0..1`. +- Timeout terminal value as `-1.0`. +- Chirp usage normalized `0..1` with death/termination if exceeding the allowed budget. +- Reward/log cleanup. 
+- Recording code moved out of `bat.h`. +- Audio helpers moved out of `bat.h`. + +Reason: ear directivity and wing micro-Doppler are real behavior changes. They should not be mixed into the baseline sweep used to judge timer/log/audio cleanup. + +## Candidate A: static ear directivity + +Add a cheap polar response for each ear. + +Expected benefit: + +- Stronger left/right spatial cue. +- Facing direction matters more. +- Rear echoes become weaker. + +Main risk: + +- Exploration may become harder if rear/side gain is too low. +- Observation scale may drift if gains are not normalized. + +Recommended first form: + +```c +float front = clamp(dot(source_dir, forward), 0, 1); +float left = clamp(dot(source_dir, left_ear_dir), 0, 1); +float right = clamp(dot(source_dir, right_ear_dir), 0, 1); +left_gain = rear_floor + front_gain*front*front + side_gain*left*left; +right_gain = rear_floor + front_gain*front*front + side_gain*right*right; +``` + +Suggested constants: + +```c +#define BAT_EAR_REAR_GAIN 0.15f +#define BAT_EAR_FRONT_GAIN 0.55f +#define BAT_EAR_SIDE_GAIN 0.45f +#define BAT_EAR_GAIN_NORM (1.0f / (BAT_EAR_REAR_GAIN + BAT_EAR_FRONT_GAIN + BAT_EAR_SIDE_GAIN)) +``` + +Research doc: + +- `ocean/bat/BAT_EAR_DIRECTIVITY_RESEARCH.md` + +## Candidate B: bug wing echo sideband + +Add prey-specific wing flutter echo structure. + +Expected benefit: + +- Bug echoes become distinguishable from obstacle echoes. +- Adds a moving-prey cue without raw audio simulation. + +Main risk: + +- More echo events can saturate event capacity or observation bins. +- If energy is too low it adds no learnable signal; if too high it changes task scale. + +Recommended first form: + +- Keep body echo unchanged. +- Add one extra bug-only wing echo. +- Use triangle phase, no `sinf`. 
+ +Suggested constants: + +```c +#define BAT_BUG_WING_ECHO_GAIN 0.20f +#define BAT_BUG_WING_FREQ_OFFSET 0.06f +#define BAT_BUG_WING_PHASE_STEP 0.11f +``` + +Research doc: + +- `ocean/bat/BAT_WING_ECHO_RESEARCH.md` + +## Sweep ordering + +1. Baseline current Bat. +2. Ear directivity only. +3. Wing sideband only. +4. Combined directivity + wing sideband only if individual variants are viable. + +Do not add both new physics changes before an ablation. It will make results ambiguous. + +## Success metrics to compare + +Use the same training/eval flow as the recent Bat work: + +- Build passes. +- Bat C tests pass. +- Training completes on current ini without timestep override. +- Compare `perf`, `base_perf`, `SPS`, `timeout`, and qualitative eval behavior. +- Level 5 eval should still look reasonable. + +Known recent baseline from audio-helper move: + +- `perf` around `0.375`. +- `base_perf` around `0.942`. +- `SPS` around `1.5M`. +- `timeout` around `0.001`. + +Do not overinterpret one training run. Use it as a regression/sanity check, then sweep. + +## Source anchors + +Ear directivity: + +- https://doi.org/10.1038/nature11664 +- https://doi.org/10.3389/fphys.2013.00089 +- https://doi.org/10.1073/pnas.1006630107 +- https://doi.org/10.1121/1.418271 +- https://doi.org/10.1121/1.3488304 +- https://doi.org/10.1242/jeb.210252 +- https://doi.org/10.3389/fphys.2013.00191 + +Wing echo / micro-Doppler: + +- https://doi.org/10.1007/BF00612592 +- https://doi.org/10.1098/rspb.2003.2487 +- https://doi.org/10.1098/rspb.2012.2830 +- https://doi.org/10.1037/bne0000315 diff --git a/ocean/bat/BAT_WING_ECHO_RESEARCH.md b/ocean/bat/BAT_WING_ECHO_RESEARCH.md new file mode 100644 index 0000000000..4e8695b51e --- /dev/null +++ b/ocean/bat/BAT_WING_ECHO_RESEARCH.md @@ -0,0 +1,288 @@ +# Bat insect-wing echo and micro-Doppler research notes + +Purpose: preserve research and implementation guidance for possible low-cost insect wing flutter / micro-Doppler echoes in `ocean/bat/`. 
+ +Status: research/design note only. No behavior change is implied by this document. + +## Short answer + +Yes, insect prey should plausibly produce more than a single body echo. Flying insect wings can create echo fluctuations, amplitude modulation, and Doppler/micro-Doppler-like frequency structure. The simplest useful Bat env approximation is: + +- Keep the existing normal body echo. +- For bug echoes only, add one or two weaker wing echoes near the body echo. +- Make wing echoes vary over time with a cheap phase oscillator. +- Keep obstacle echoes unchanged. + +This should add a moving-prey signature without turning the environment into an expensive acoustic simulator. + +## Useful terminology + +- `Doppler shift`: frequency shift caused by relative motion between bat and target. +- `Micro-Doppler`: additional Doppler components from moving parts of a target, such as flapping wings, legs, rotors, or vibrating surfaces. +- `Flutter detection`: detecting oscillating target movements, especially insect wing motion, in echoes. +- `Amplitude modulation`: echo strength fluctuates as wing orientation and scattering cross-section change. +- `Spectral glints`: brief bright echo components from reflective target parts at favorable orientations. +- `Sidebands`: frequency components above and below a carrier/body frequency caused by modulation. + +## What the literature says + +### CF/CF-FM bats can use Doppler and flutter cues + +The classic result is Schnitzler and Flieger on greater horseshoe bats detecting oscillating target movement. Crossref metadata confirms the paper: + +- Schnitzler, H.-U.; Flieger, E. `Detection of oscillating target movements by echolocation in the Greater Horseshoe bat`. Journal of Comparative Physiology 153, 385-391, 1983. DOI: https://doi.org/10.1007/BF00612592 + +Secondary summaries and reviews describe the key idea: CF bats are especially suited to detecting target velocity and wing flutter as Doppler-shifted frequencies. 
Oscillating wings also create amplitude shifts that help distinguish flying prey from stationary targets. + +Implementation relevance: + +- Bat env currently uses chirps/echoes as compact observations. It does not need raw CF sonar. +- A cheap wing signature is still justified because it gives the policy a prey-specific temporal/frequency cue. +- Apply it only to `BAT_ECHO_BUG`, not walls/obstacles. + +### Echolocation range and wingbeat timing are behaviorally linked + +Holderied and von Helversen studied aerial-hawking bats and found a relationship between echolocation range and wingbeat period. + +Source: + +- Holderied, M. W.; von Helversen, O. `Echolocation range and wingbeat period match in aerial-hawking bats`. Proceedings of the Royal Society B 270, 2293-2299, 2003. DOI: https://doi.org/10.1098/rspb.2003.2487 +- Royal Society page: https://royalsocietypublishing.org/doi/10.1098/rspb.2003.2487 + +Implementation relevance: + +- Wingbeat dynamics are not just visual animation; they are related to sensing and prey pursuit timing. +- If Bat already has a tick-based model, wing phase can update once per tick using a fixed increment. +- No per-chirp expensive computation is needed. + +### Bats can classify prey shape/material from echo structure + +Geipel, Jung, and Kalko showed that `Micronycteris microtis` can detect, classify, and localize silent, motionless prey in clutter using echolocation alone. Their abstract says bats used short, multi-harmonic broadband calls and appeared to perceive a detailed acoustic image based on shape, surface structure, and material. + +Source: + +- Geipel, I.; Jung, K.; Kalko, E. K. V. `Perception of silent and motionless prey on vegetation by echolocation in the gleaning bat Micronycteris microtis`. Proceedings of the Royal Society B 280:20122830, 2013. 
DOI: https://doi.org/10.1098/rspb.2012.2830 +- Royal Society page: https://royalsocietypublishing.org/doi/10.1098/rspb.2012.2830 + +Implementation relevance: + +- Even without active wing motion, bugs are not acoustically equivalent to points. +- If adding wing sidebands, keep them as prey-specific echo complexity, not as general noise. +- This supports making bug echoes richer than obstacle echoes. + +### Micro-spectral ripple research supports compact target-specific echo features + +Shriram and Simmons studied bats perceiving natural-size targets as a unitary class using micro-spectral ripples in echoes. + +Source: + +- Shriram, U.; Simmons, J. A. `Echolocating bats perceive natural-size targets as a unitary class using micro-spectral ripples in echoes`. Behavioral Neuroscience 133(3), 297-304, 2019. DOI: https://doi.org/10.1037/bne0000315 +- APA page: https://doi.apa.org/doi/10.1037/bne0000315 + +Implementation relevance: + +- Richer echo spectra can matter, but Bat should not model detailed spectra first. +- A few deterministic sidebands are a cheap stand-in for target-specific microstructure. +- This is closer to a useful observation feature than raw acoustic realism. + +## Recommended cheap model + +### Core idea + +When scheduling a bug echo, add: + +- `body echo`: existing echo path, unchanged except for any directivity/range logic already present. +- `wing upper echo`: smaller energy, slightly higher normalized frequency. +- `wing lower echo`: smaller energy, slightly lower normalized frequency. + +The upper/lower echoes represent wing motion toward/away from the bat and modulation around the body return. 
+ +Sketch (`bug_wing_phase` is a normalized cycle fraction in `[0, 1)`, so it is scaled by 2*pi before `sinf`): + +```c +float wing_phase = env->bug_wing_phase; +float wing = 0.5f + 0.5f * sinf(6.2831853f * wing_phase); +float wing_offset = BAT_BUG_WING_FREQ_OFFSET * (0.5f + 0.5f * wing); +float wing_energy = body_energy * BAT_BUG_WING_ECHO_GAIN; + +bat_add_echo_event(env, echo_time, body_freq, body_energy, left_gain, right_gain, BAT_ECHO_BUG); +bat_add_echo_event(env, echo_time, body_freq + wing_offset, wing_energy, left_gain, right_gain, BAT_ECHO_BUG); +bat_add_echo_event(env, echo_time, body_freq - wing_offset, wing_energy, left_gain, right_gain, BAT_ECHO_BUG); +``` + +If avoiding `sinf`, use a triangle oscillator: + +```c +float phase = env->bug_wing_phase; +float tri = phase < 0.5f ? phase * 2.0f : (1.0f - phase) * 2.0f; +float wing_offset = BAT_BUG_WING_FREQ_OFFSET * tri; +``` + +Then update phase once per env step: + +```c +env->bug_wing_phase += BAT_BUG_WING_PHASE_STEP; +if (env->bug_wing_phase >= 1.0f) env->bug_wing_phase -= 1.0f; +``` + +Use a constant phase step instead of division per tick. If it needs to depend on tick rate, define the reciprocal as a constant. + +### Initial constants + +The actual values should be tuned by sweep, but a reasonable first pass: + +```c +#define BAT_BUG_WING_ECHO_GAIN 0.20f +#define BAT_BUG_WING_FREQ_OFFSET 0.06f +#define BAT_BUG_WING_PHASE_STEP 0.11f +``` + +Interpretation: + +- `BAT_BUG_WING_ECHO_GAIN`: each sideband gets 20% of body energy. +- `BAT_BUG_WING_FREQ_OFFSET`: normalized frequency offset, not real kHz. +- `BAT_BUG_WING_PHASE_STEP`: wing animation/sensing phase increment per env tick. + +If the two sidebands make total bug energy too high, compensate: + +```c +float body_energy = base_energy * 0.75f; +float wing_energy = base_energy * 0.125f; +``` + +This preserves total energy while adding structure. If the goal is to make bugs easier to identify, do not preserve total energy exactly; but then treat it as a real behavior change.
+ +## Cheaper one-sideband variant + +If three echo events per bug chirp is too much, use one extra echo whose sign flips with wing phase: + +```c +float tri = env->bug_wing_phase < 0.5f ? env->bug_wing_phase * 2.0f : (1.0f - env->bug_wing_phase) * 2.0f; +float sign = env->bug_wing_phase < 0.5f ? 1.0f : -1.0f; +float wing_freq = body_freq + sign * BAT_BUG_WING_FREQ_OFFSET * tri; +float wing_energy = body_energy * BAT_BUG_WING_ECHO_GAIN; + +bat_add_echo_event(env, echo_time, body_freq, body_energy, left_gain, right_gain, BAT_ECHO_BUG); +bat_add_echo_event(env, echo_time, wing_freq, wing_energy, left_gain, right_gain, BAT_ECHO_BUG); +``` + +This is half the extra event count. It gives time-varying high/low pings, but not simultaneous symmetric sidebands. + +## Even cheaper amplitude-only variant + +If we want no extra echo events, modulate bug echo energy: + +```c +float tri = env->bug_wing_phase < 0.5f ? env->bug_wing_phase * 2.0f : (1.0f - env->bug_wing_phase) * 2.0f; +float flutter_gain = 1.0f + BAT_BUG_WING_AMP_MOD * (tri - 0.5f); +body_energy *= flutter_gain; +``` + +Potential constant: + +```c +#define BAT_BUG_WING_AMP_MOD 0.30f +``` + +This is cheapest but probably less useful because the policy may see it as noise unless it can integrate over time. + +## Recommended first implementation choice + +Use the two-sideband model only if echo event capacity is safely high and current observations can represent multiple arrivals without saturation. + +Use the one-sideband model if event pressure is a concern. + +Use amplitude-only only as a fallback. + +My recommendation for first sweep: + +- One extra wing echo per bug echo. +- Triangle oscillator, no `sinf`. +- `BAT_BUG_WING_ECHO_GAIN = 0.20f`. +- `BAT_BUG_WING_FREQ_OFFSET = 0.06f`. +- Preserve body echo unchanged for the first test so the new signal is additive and easy to ablate. 
+ +## Where it should live in Bat + +Likely location: + +- Bug echo scheduling path, near `bat_schedule_echo` or wherever `BAT_ECHO_BUG` events are created. + +Rules: + +- Do not add this to obstacle echoes. +- Do not add this to render-only or audio-only code. +- Add/advance wing phase in the core env tick/reset state if deterministic observations depend on it. +- If state serialization exists or is added later, include wing phase. +- If randomizing initial wing phase, seed it deterministically with env RNG. + +## Performance considerations + +Good: + +- Phase update once per step. +- Triangle wave instead of `sinf`. +- Constants as `#define`. +- Add at most one extra event first. +- Clamp normalized frequency with existing clamp logic. + +Avoid: + +- Per-echo trigonometry if not needed. +- FFT or convolution. +- Large target meshes. +- Per-wing geometry. +- More than one or two additional events without checking event capacity and observation saturation. + +## Expected behavior change + +Likely effects: + +- Bug echoes become more identifiable than obstacle echoes. +- The policy may learn that moving/oscillating echo structure indicates prey. +- Depending on reward and observation clipping, it may improve pursuit or just add noise. +- If event buffers saturate, it can silently hurt by dropping echoes. + +Important risk: + +- PufferLib reward clipping already caused signal issues earlier. Echo observation scaling can have a similar failure mode if new wing echoes saturate observation bins. Keep energy modest and inspect normalization before committing to a sweep. + +## Interaction with ear directivity + +Ear directivity and wing sidebands should be tested separately first. + +Reason: + +- Ear directivity changes spatial gain. +- Wing sidebands change prey identity/frequency/time structure. +- Combining them at once makes it hard to know what helped or broke. + +Order recommendation: + +1. Sweep current baseline after timer/log/audio cleanup. +2. 
// Strip leading and trailing whitespace from `s` in place.
// Returns a pointer to the first non-space character inside the same buffer;
// the trailing whitespace is cut off by writing a terminator over it.
static char* trim(char* s) {
    char* first = s;
    while (isspace((unsigned char)*first)) {
        first++;
    }

    size_t len = strlen(first);
    while (len > 0 && isspace((unsigned char)first[len - 1])) {
        len--;
    }
    first[len] = '\0';
    return first;
}
(strcmp(key, "record_video_fps") == 0) env->record_video_fps = (int)value; + else if (strcmp(key, "record_video_seconds") == 0) env->record_video_seconds = (int)value; + else if (strcmp(key, "record_video_audio") == 0) env->record_video_audio = (int)value; + else if (strcmp(key, "bug_echo_farther_penalty_scale") == 0) env->bug_echo_farther_penalty_scale = value; + else if (strcmp(key, "bug_echo_reward_scale") == 0) env->bug_echo_reward_scale = value; + else if (strcmp(key, "bug_wing_sideband_gain") == 0) env->bug_wing_sideband_gain = value; + else if (strcmp(key, "curriculum_initial_level") == 0) env->curriculum_initial_level = (int)value; + else if (strcmp(key, "curriculum_obstacle_step") == 0) env->curriculum_obstacle_step = (int)value; + else if (strcmp(key, "curriculum_start_bug_distance") == 0) env->curriculum_start_bug_distance = value; + else if (strcmp(key, "curriculum_successes_per_level") == 0) env->curriculum_successes_per_level = (int)value; + else if (strcmp(key, "ear_separation_scale") == 0) env->ear_separation_scale = value; + else if (strcmp(key, "ear_rear_gain") == 0) env->ear_rear_gain = value; + else if (strcmp(key, "ear_front_gain") == 0) env->ear_front_gain = value; + else if (strcmp(key, "ear_side_gain") == 0) env->ear_side_gain = value; + else if (strcmp(key, "early_chirp_penalty") == 0) env->early_chirp_penalty = value; + else if (strcmp(key, "progress_reward_scale") == 0) env->progress_reward_scale = value; + else if (strcmp(key, "reflector_strength") == 0) env->reflector_strength = value; + else if (strcmp(key, "sound_speed") == 0) env->sound_speed = value; + else if (strcmp(key, "step_cost") == 0) env->step_cost = value; + else if (strcmp(key, "valid_chirp_reward") == 0) env->valid_chirp_reward = value; + else if (strcmp(key, "chirp_cooldown_ticks") == 0) env->chirp_cooldown_ticks = (int)value; + else if (strcmp(key, "chirp_efficiency_reward") == 0) env->chirp_efficiency_reward = value; + else if (strcmp(key, "chirp_overlap_penalty") == 
0) env->chirp_overlap_penalty = value; + else if (strcmp(key, "collision_penalty") == 0) env->collision_penalty = value; +} + +static void load_env_config(Bat* env, const char* path) { + FILE* file = fopen(path, "r"); + if (file == NULL) return; + + bool in_env = false; + char line[256]; + while (fgets(line, sizeof(line), file) != NULL) { + char* s = trim(line); + if (*s == '\0' || *s == '#' || *s == ';') continue; + if (*s == '[') { + in_env = strcmp(s, "[env]") == 0; + continue; + } + if (!in_env) continue; + + char* eq = strchr(s, '='); + if (eq == NULL) continue; + *eq = '\0'; + char* key = trim(s); + char* raw_value = trim(eq + 1); + apply_env_config_value(env, key, strtof(raw_value, NULL)); + } + + fclose(file); +} + +void demo() { + Bat env = { + .num_agents = NUM_AGENTS, + .render_target_fps = 60, + .record_video_fps = 30, + .record_video_seconds = 30, + .record_video_audio = 1, + }; + load_env_config(&env, DEMO_CONFIG_PATH); + env.rng = (unsigned int)time(NULL); + allocate(&env); + env.client = make_client(&env); + c_reset(&env); + + while (!WindowShouldClose()) { + memset(env.actions, 0, sizeof(float) * NUM_ACTIONS); + if (IsKeyDown(KEY_W)) env.actions[ACTION_MOVE] = THRUST_FORWARD; + if (IsKeyDown(KEY_S)) env.actions[ACTION_MOVE] = BRAKE; + if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) env.actions[ACTION_TURN] = TURN_LEFT; + if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) env.actions[ACTION_TURN] = TURN_RIGHT; + env.actions[ACTION_CHIRP_FREQ_END] = CHIRP_FREQ_BINS - 1; + env.actions[ACTION_CHIRP_DURATION] = 1; + env.actions[ACTION_CHIRP_EMIT] = IsKeyDown(KEY_SPACE) ? 
1.0f : 0.0f; + c_step(&env); + c_render(&env); + } + + close_client(env.client); + free_allocated(&env); +} + +int main() { + demo(); + return 0; +} diff --git a/ocean/bat/bat.h b/ocean/bat/bat.h new file mode 100644 index 0000000000..f32b4c9312 --- /dev/null +++ b/ocean/bat/bat.h @@ -0,0 +1,1542 @@ +#pragma once + +#include +#include +#include +#include +#include + +#ifndef BAT_HEADLESS +#include "raylib.h" +#endif + +#define NUM_AGENTS 1 +#define NUM_ACTIONS 6 +#define ACTION_MOVE 0 +#define ACTION_TURN 1 +#define ACTION_CHIRP_FREQ_START 2 +#define ACTION_CHIRP_FREQ_END 3 +#define ACTION_CHIRP_DURATION 4 +#define ACTION_CHIRP_EMIT 5 +#define MOVE_ACTIONS 3 +#define TURN_ACTIONS 3 +#define CHIRP_FREQ_BINS 8 +#define CHIRP_DURATION_BINS 4 +#define CHIRP_EMIT_ACTIONS 2 + +#define FREQ_BINS 16 +#define LEFT_FREQ_OFFSET 0 +#define RIGHT_FREQ_OFFSET FREQ_BINS +#define CHIRP_AGE_OBS (RIGHT_FREQ_OFFSET + FREQ_BINS) +#define CHIRP_COOLDOWN_OBS (CHIRP_AGE_OBS + 1) +#define CHIRP_START_OBS (CHIRP_COOLDOWN_OBS + 1) +#define CHIRP_END_OBS (CHIRP_START_OBS + 1) +#define CHIRP_DURATION_OBS (CHIRP_END_OBS + 1) +#define CHIRPS_USED_OBS (CHIRP_DURATION_OBS + 1) +#define FORWARD_SPEED_OBS (CHIRPS_USED_OBS + 1) +#define TURN_RATE_OBS (FORWARD_SPEED_OBS + 1) +#define TIMER_OBS (TURN_RATE_OBS + 1) +#define OBS_SIZE (TIMER_OBS + 1) + +#define NOOP 0 +#define THRUST_FORWARD 1 +#define BRAKE 2 + +#define TURN_NONE 0 +#define TURN_LEFT 1 +#define TURN_RIGHT 2 + +#define MAX_STEPS 512 +#define TICK_RATE (1.0f/60.0f) +#define ARENA_WIDTH 64 +#define ARENA_HEIGHT 64 +#define AGENT_RADIUS 2.0f +#define BUG_RADIUS 1.5f +#define SPAWN_MARGIN 6.0f +#define BUG_SPEED 4.0f +#define BUG_MANEUVER_START_LEVEL 7 +#define BUG_MANEUVER_STRENGTH 0.4f +#define BUG_MANEUVER_FREQUENCY 0.4f +#define INBOUND_BUG_SPEED_MULTIPLIER 1.75f +#define INBOUND_HEADING_NOISE_DEGREES 18.0f +#define REFLECTOR_SPACING 8.0f +#define MAX_ECHO_RANGE 128.0f +#define ECHO_MIN_FORWARD -0.35f +#define BUG_ECHO_MIN_DISPLACEMENT 
1.0f +#define CURRICULUM_MAX_OBSTACLES 3 +#define CURRICULUM_BUG_DISTANCE_STEP 2.0f +#define CURRICULUM_MAX_BUG_DISTANCE 40.0f +#define CURRICULUM_INBOUND_START_LEVEL 8 +#define CURRICULUM_INBOUND_MAX_BUG_DISTANCE 56.0f +#define CURRICULUM_INBOUND_BUG_DISTANCE_STEP 4.0f +#define PI_F 3.14159265358979323846f +#define TWO_PI (2.0f * PI_F) +#define CHIRP_HISTORY 4 +#define CHIRP_RINGS 5 +#define MAX_CHIRP_SLICES 16 +#define ECHO_QUEUE_TICKS 256 +#define AUDIO_VOICES 8 +#define AUDIO_SAMPLE_RATE 48000 +#define AUDIO_MIN_HZ 600.0f +#define AUDIO_MAX_HZ 3600.0f +#define AUDIO_VOLUME 0.22f +#define AUDIO_ENVELOPE_FADE 0.08f +#define RECORD_MAX_VOICES 16 +#define FREQ_HISTORY_TICKS 96 +#define FREQ_PANEL_WIDTH 384 +#define FREQ_WATERFALL_WIDTH 192 +#define FREQ_PANEL_MARGIN 8 +#define CHIRP_PERF_FLOOR 0.05f +#define CHIRP_MIN_DURATION_SECONDS 0.04f +#define CHIRP_DURATION_RANGE_SECONDS 0.18f +#define MAX_CHIRPS_PER_EPISODE 15 + +#define ECHO_STATIC 0 +#define ECHO_BUG 1 +#define ARENA_REFLECTORS 8 + +static const float ARENA_REFLECTOR_X[ARENA_REFLECTORS] = {0.0f, 1.0f, 0.0f, 1.0f, 0.5f, 0.5f, 0.0f, 1.0f}; +static const float ARENA_REFLECTOR_Y[ARENA_REFLECTORS] = {0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, 0.5f, 0.5f}; + +typedef struct ChirpEvent { + float x; + float y; + float source_x[MAX_CHIRP_SLICES]; + float source_y[MAX_CHIRP_SLICES]; + float start_freq; + float end_freq; + float duration; + int birth_tick; + int slice_count; + int slices_scheduled; + int active; +} ChirpEvent; + +typedef struct EchoBucket { + float energy[2][FREQ_BINS]; + float closest_bug_echo_path; + int tick; +} EchoBucket; + +typedef struct BatRecordVoice { + int active; + int start_sample; + float start_freq; + float end_freq; + float duration; +} BatRecordVoice; + +typedef struct Log { + float perf; + float base_perf; + float score; + float episode_return; + float episode_length; + float collision; + float timeout; + float curriculum_level; + float curriculum_difficulty; + float curriculum_perf; + 
float num_obstacles; + float chirps_emitted; + float chirp_perf; + float n; +} Log; + +typedef struct Client { + int width; + int height; +#ifndef BAT_HEADLESS + int audio_ready; + int last_audio_chirp_serial; + int audio_voice_cursor; + Sound chirp_sounds[AUDIO_VOICES]; + int chirp_sound_loaded[AUDIO_VOICES]; + int recording_initialized; + int recording_finalized; + int record_frame; + int record_max_frames; + int record_fps; + int record_audio; + int record_last_audio_chirp_serial; + int record_audio_sample_cursor; + int record_audio_data_bytes; + int record_voice_cursor; + FILE* record_wav; + char record_frame_dir[256]; + char record_wav_path[256]; + char record_mp4_path[256]; + BatRecordVoice record_voices[RECORD_MAX_VOICES]; + float freq_history[FREQ_HISTORY_TICKS][2][FREQ_BINS]; + int freq_history_head; + int freq_history_last_tick; +#endif +} Client; + +typedef enum ChirpStatus { + CHIRP_STATUS_OVER_BUDGET = -2, + CHIRP_STATUS_COOLDOWN = -1, + CHIRP_STATUS_NONE = 0, + CHIRP_STATUS_EMITTED = 1, +} ChirpStatus; + +typedef struct Bat { + Client* client; + Log log; + float* observations; + float* actions; + float* rewards; + float* terminals; + int num_agents; + + int tick; + int render_target_fps; + int record_video; + int record_video_fps; + int record_video_seconds; + int record_video_audio; + int num_obstacles; + int curriculum_level; + int curriculum_initial_level; + int curriculum_obstacle_step; + int curriculum_successes_per_level; + int curriculum_successes_at_level; + float curriculum_start_bug_distance; + + float x; + float y; + float vx; + float vy; + float heading; + float turn_velocity; + float ear_separation_scale; + float ear_rear_gain; + float ear_front_gain; + float ear_side_gain; + float max_speed; + float min_speed; + float accel; + float turn_rate; + + float bug_x; + float bug_y; + float bug_vx; + float bug_vy; + int bug_inbound; + int bug_maneuver_mode; + float bug_base_heading; + float bug_maneuver_phase; + float bug_maneuver_rate; + float 
// Clamp `v` into [lo, hi]. The lower bound is tested first, so it wins when
// the bounds are inverted.
static inline float bat_clampf(float v, float lo, float hi) {
    return v < lo ? lo : (v > hi ? hi : v);
}
*source_y = chirp->source_y[slice_idx]; + return; + } + *source_x = chirp->x; + *source_y = chirp->y; +} + +static inline float chirp_age_norm_denominator(Bat* env) { + float travel_ticks = MAX_ECHO_RANGE / env->sound_speed / TICK_RATE; + float chirp_ticks = chirp_duration_seconds(1.0f) / TICK_RATE; + return 1.25f * (travel_ticks + chirp_ticks); +} + +static inline float norm_bin(int idx, int count) { + return idx / (float)(count - 1); +} + +static inline float dist(float ax, float ay, float bx, float by) { + float dx = bx - ax; + float dy = by - ay; + return sqrtf(dx*dx + dy*dy); +} + +static inline void norm_vec(float x, float y, float* ox, float* oy) { + float l = sqrtf(x*x + y*y); + if (l <= 0.000001f) { + *ox = 1.0f; + *oy = 0.0f; + return; + } + *ox = x / l; + *oy = y / l; +} + +static inline bool circle_rect_collision(float cx, float cy, float r, + float rx, float ry, float rw, float rh) { + float px = bat_clampf(cx, rx, rx + rw); + float py = bat_clampf(cy, ry, ry + rh); + return dist(cx, cy, px, py) <= r; +} + +static inline bool rects_overlap(float ax, float ay, float aw, float ah, + float bx, float by, float bw, float bh, float margin) { + return ax - margin < bx + bw && + ax + aw + margin > bx && + ay - margin < by + bh && + ay + ah + margin > by; +} + +static inline void sample_in_quadrant(Bat* env, int quadrant, float* x, float* y) { + int east = quadrant & 1; + int south = (quadrant >> 1) & 1; + float half_w = ARENA_WIDTH * 0.5f; + float half_h = ARENA_HEIGHT * 0.5f; + float min_x = (east ? half_w : 0.0f) + SPAWN_MARGIN; + float max_x = (east ? (float)ARENA_WIDTH : half_w) - SPAWN_MARGIN; + float min_y = (south ? half_h : 0.0f) + SPAWN_MARGIN; + float max_y = (south ? (float)ARENA_HEIGHT : half_h) - SPAWN_MARGIN; + *x = min_x + randf(env) * (max_x - min_x); + *y = min_y + randf(env) * (max_y - min_y); +} + +static inline int curriculum_obstacles(Bat* env) { + int count = env->curriculum_level > 0 + ? 
1 + (env->curriculum_level - 1) / env->curriculum_obstacle_step : 0; + return count > CURRICULUM_MAX_OBSTACLES ? CURRICULUM_MAX_OBSTACLES : count; +} + +static inline float curriculum_bug_distance(Bat* env) { + return bat_clampf(env->curriculum_start_bug_distance + + CURRICULUM_BUG_DISTANCE_STEP * env->curriculum_level, + env->curriculum_start_bug_distance, + CURRICULUM_MAX_BUG_DISTANCE); +} + +static inline float curriculum_inbound_bug_distance(Bat* env) { + return bat_clampf(CURRICULUM_MAX_BUG_DISTANCE + + CURRICULUM_INBOUND_BUG_DISTANCE_STEP + * (env->curriculum_level - CURRICULUM_INBOUND_START_LEVEL + 1), + CURRICULUM_MAX_BUG_DISTANCE, CURRICULUM_INBOUND_MAX_BUG_DISTANCE); +} + +static inline float curriculum_bug_maneuver_strength(Bat* env) { + if (env->curriculum_level < BUG_MANEUVER_START_LEVEL) return 0.0f; + int extra_levels = env->curriculum_level - BUG_MANEUVER_START_LEVEL; + float ramp = extra_levels <= 0 ? 0.25f : 0.75f + 0.25f * (extra_levels - 1); + return BUG_MANEUVER_STRENGTH * bat_clampf(ramp, 0.0f, 1.0f); +} + +// TODO: When we are ready to break determinism, simplify bug maneuvering to one +// always-active sine wave with curriculum-ramped amplitude, then remove the mode +// and sign branches below. +static inline float curriculum_bug_maneuver_frequency(Bat* env) { + if (env->curriculum_level < BUG_MANEUVER_START_LEVEL) { + return BUG_MANEUVER_FREQUENCY; + } + return BUG_MANEUVER_FREQUENCY * bat_clampf( + 1.0f + 0.50f * (env->curriculum_level - BUG_MANEUVER_START_LEVEL), + 1.0f, 2.5f); +} + +static inline float chirps_used_ratio(Bat* env) { + return bat_clampf(env->chirps_emitted / (float)MAX_CHIRPS_PER_EPISODE, 0.0f, 1.0f); +} + +// TODO: Revisit this when we are ready to break reward determinism. The ratio is +// still an observation, but this reward bonus may be removable before merge. 
// Place the bat and the bug `target_distance` apart inside the arena.
//
// Writes env->x/y and env->bug_x/bug_y. Three strategies are tried in order:
//   1. Up to 96 attempts at an exact-distance placement along a random angle.
//   2. Up to 64 attempts at sampling the two in opposite quadrants with a
//      minimum separation.
//   3. A fixed placement at the centres of two opposite quadrants.
// The number and order of randf() calls determine the RNG stream, so the
// statement order here affects reset reproducibility.
static inline void sample_spawns_at_distance(Bat* env, float target_distance) {
    for (int attempt = 0; attempt < 96; attempt++) {
        // Random direction in [-pi, pi) from the bat to the bug.
        float angle = randf(env) * TWO_PI - PI_F;
        float dx = cosf(angle) * target_distance;
        float dy = sinf(angle) * target_distance;
        // Range of bat positions for which both the bat and the bug (bat + d)
        // stay at least SPAWN_MARGIN away from every wall.
        float min_bat_x = fmaxf(SPAWN_MARGIN, SPAWN_MARGIN - dx);
        float max_bat_x = fminf(ARENA_WIDTH - SPAWN_MARGIN, ARENA_WIDTH - SPAWN_MARGIN - dx);
        float min_bat_y = fmaxf(SPAWN_MARGIN, SPAWN_MARGIN - dy);
        float max_bat_y = fminf(ARENA_HEIGHT - SPAWN_MARGIN, ARENA_HEIGHT - SPAWN_MARGIN - dy);
        // Empty range: this offset does not fit in the arena, try another angle.
        if (max_bat_x < min_bat_x || max_bat_y < min_bat_y) continue;

        env->x = min_bat_x + randf(env) * (max_bat_x - min_bat_x);
        env->y = min_bat_y + randf(env) * (max_bat_y - min_bat_y);
        env->bug_x = env->x + dx;
        env->bug_y = env->y + dy;
        return;
    }

    // Fallback: give up on the exact distance and use opposite quadrants.
    int agent_quadrant = (int)(randf(env) * 4.0f);
    // XOR with 3 flips both quadrant bits, selecting the diagonal opposite.
    int bug_quadrant = agent_quadrant ^ 3;
    float min_sep = fminf(ARENA_WIDTH, ARENA_HEIGHT) * 0.31f;
    for (int attempt = 0; attempt < 64; attempt++) {
        sample_in_quadrant(env, agent_quadrant, &env->x, &env->y);
        sample_in_quadrant(env, bug_quadrant, &env->bug_x, &env->bug_y);
        if (dist(env->x, env->y, env->bug_x, env->bug_y) >= min_sep) {
            return;
        }
    }

    // Last resort: deterministic quadrant centres (bit 0 picks x, bit 1 picks y).
    env->x = ARENA_WIDTH * ((agent_quadrant & 1) ? 0.75f : 0.25f);
    env->y = ARENA_HEIGHT * ((agent_quadrant & 2) ? 0.75f : 0.25f);
    env->bug_x = ARENA_WIDTH * ((bug_quadrant & 1) ? 0.75f : 0.25f);
    env->bug_y = ARENA_HEIGHT * ((bug_quadrant & 2) ? 0.75f : 0.25f);
}
BUG_SPEED * INBOUND_BUG_SPEED_MULTIPLIER : BUG_SPEED; + float heading; + if (env->bug_inbound) { + float tx, ty; + norm_vec(env->x - env->bug_x, env->y - env->bug_y, &tx, &ty); + float noise = INBOUND_HEADING_NOISE_DEGREES * (PI_F / 180.0f); + heading = atan2f(ty, tx) + (2.0f * randf(env) - 1.0f) * noise; + } else { + heading = randf(env) * TWO_PI - PI_F; + } + env->bug_base_heading = heading; + env->bug_vx = cosf(heading) * speed; + env->bug_vy = sinf(heading) * speed; +} + +static inline void advance_curriculum(Bat* env) { + env->curriculum_successes_at_level += 1; + if (env->curriculum_successes_at_level >= env->curriculum_successes_per_level) { + env->curriculum_level += 1; + env->curriculum_successes_at_level = 0; + } +} + +// TODO: Revisit this when we are ready to break reset determinism. If overlapping +// random obstacles are acceptable, remove rects_overlap(), obstacle_clear(), and +// the attempt loop/fallback placement in generate_obstacles(). +static inline bool obstacle_clear(Bat* env, int idx, float x, float y, + float w, float h) { + if (circle_rect_collision(env->x, env->y, AGENT_RADIUS + 2.0f, x, y, w, h)) { + return false; + } + if (circle_rect_collision(env->bug_x, env->bug_y, BUG_RADIUS + 2.0f, x, y, w, h)) { + return false; + } + for (int j = 0; j < idx; j++) { + if (rects_overlap(x, y, w, h, + env->obstacle_x[j], env->obstacle_y[j], env->obstacle_w[j], env->obstacle_h[j], 3.0f)) { + return false; + } + } + return true; +} + +static inline void generate_obstacles(Bat* env) { + for (int i = 0; i < env->num_obstacles; i++) { + bool placed = false; + for (int attempt = 0; attempt < 96; attempt++) { + float w = 3.0f + 5.0f * randf(env); + float h = 3.0f + 5.0f * randf(env); + float margin = 4.0f; + float x = margin + randf(env) * (ARENA_WIDTH - w - 2.0f * margin); + float y = margin + randf(env) * (ARENA_HEIGHT - h - 2.0f * margin); + if (obstacle_clear(env, i, x, y, w, h)) { + env->obstacle_x[i] = x; + env->obstacle_y[i] = y; + env->obstacle_w[i] 
= w; + env->obstacle_h[i] = h; + placed = true; + break; + } + } + if (!placed) { + float w = 6.0f; + float h = 6.0f; + float x = ARENA_WIDTH * (0.30f + 0.20f * (i % 2)) - w * 0.5f; + float y = ARENA_HEIGHT * (0.30f + 0.20f * ((i + 1) % 2)) - h * 0.5f; + env->obstacle_x[i] = x; + env->obstacle_y[i] = y; + env->obstacle_w[i] = w; + env->obstacle_h[i] = h; + } + } +} + +void init(Bat* env) { + env->curriculum_level = env->curriculum_initial_level; + env->curriculum_successes_at_level = 0; +} + +void allocate(Bat* env) { + init(env); + env->observations = (float*)calloc(OBS_SIZE, sizeof(float)); + env->actions = (float*)calloc(NUM_ACTIONS, sizeof(float)); + env->rewards = (float*)calloc(1, sizeof(float)); + env->terminals = (float*)calloc(1, sizeof(float)); +} + +void c_close(Bat* env) { + (void)env; +} + +void free_allocated(Bat* env) { + free(env->actions); + free(env->observations); + free(env->terminals); + free(env->rewards); + c_close(env); +} + +static inline void add_log(Bat* env, float success, float collision, float timeout) { + float curriculum_difficulty_value = curriculum_difficulty(env); + float chirp_perf_value = chirp_perf(env); + env->log.perf += success * curriculum_difficulty_value * chirp_perf_value; + env->log.base_perf += success; + env->log.score += env->episode_return; + env->log.episode_return += env->episode_return; + env->log.episode_length += env->tick; + env->log.collision += collision; + env->log.timeout += timeout; + env->log.curriculum_level += env->curriculum_level; + env->log.curriculum_difficulty += curriculum_difficulty_value; + env->log.curriculum_perf += success * curriculum_difficulty_value; + env->log.num_obstacles += env->num_obstacles; + env->log.chirps_emitted += env->chirps_emitted; + env->log.chirp_perf += chirp_perf_value; + env->log.n += 1.0f; +} + +static inline void clear_echo_bucket(EchoBucket* bucket) { + memset(bucket, 0, sizeof(*bucket)); + bucket->closest_bug_echo_path = -1.0f; + bucket->tick = -1; +} + +static 
// Queue one echo arrival for a single ear.
//
// The echo is accumulated into the ring-buffer bucket for the tick on which
// it arrives (`receive_tick` rounded up). `freq` is mapped onto FREQ_BINS bins
// by `freq * FREQ_BINS`. Bug echoes (source == ECHO_BUG) additionally add
// `intensity * bug_wing_sideband_gain` to the two neighbouring bins and record
// the shortest bug echo path seen in that bucket.
//
// The echo is dropped when it is already due, weaker than 1e-6, aimed at an
// invalid ear index, or arrives ECHO_QUEUE_TICKS or more ticks in the future.
static inline void add_echo_event(Bat* env, int ear, float receive_tick,
        float freq, float intensity, float path, int source) {
    // EchoBucket.energy has exactly two ear rows.
    if (ear < 0 || ear > 1) return;
    // Negated comparisons so that NaN inputs are dropped as well.
    if (!(receive_tick > env->tick)) return;
    if (!(intensity > 0.000001f)) return;
    // Reject far-future or infinite arrivals (e.g. sound_speed == 0) before
    // the float-to-int cast, which is undefined outside the int range.
    if (!(receive_tick - env->tick < (float)ECHO_QUEUE_TICKS)) return;

    int arrival_tick = (int)ceilf(receive_tick);
    if (arrival_tick - env->tick >= ECHO_QUEUE_TICKS) return;
    int slot = arrival_tick % ECHO_QUEUE_TICKS;
    EchoBucket* bucket = &env->echo_queue[slot];
    // A slot still tagged with another tick is stale; recycle it.
    if (bucket->tick != arrival_tick) {
        clear_echo_bucket(bucket);
        bucket->tick = arrival_tick;
    }

    // Clamp the bin on both sides so a negative, oversized, or NaN frequency
    // cannot index outside energy[ear][].
    float scaled = freq * FREQ_BINS;
    int bin;
    if (!(scaled > 0.0f)) {
        bin = 0;
    } else if (scaled >= (float)FREQ_BINS) {
        bin = FREQ_BINS - 1;
    } else {
        bin = (int)scaled;
    }

    bucket->energy[ear][bin] += intensity;
    if (source == ECHO_BUG) {
        float sideband = intensity * env->bug_wing_sideband_gain;
        if (sideband > 0.000001f) {
            if (bin > 0) bucket->energy[ear][bin - 1] += sideband;
            if (bin + 1 < FREQ_BINS) bucket->energy[ear][bin + 1] += sideband;
        }
        if (bucket->closest_bug_echo_path < 0.0f || path < bucket->closest_bug_echo_path) {
            bucket->closest_bug_echo_path = path;
        }
    }
}
source); +} + +static inline float expected_bug_echo_tick(Bat* env, ChirpEvent* chirp) { + float fx = cosf(env->heading); + float fy = sinf(env->heading); + float ux, uy; + norm_vec(env->bug_x - chirp->x, env->bug_y - chirp->y, &ux, &uy); + float forward = ux * fx + uy * fy; + if (forward < ECHO_MIN_FORWARD) return -1.0f; + + float left_ear_x, left_ear_y, right_ear_x, right_ear_y; + ear_positions(env, &left_ear_x, &left_ear_y, &right_ear_x, &right_ear_y); + float source_path = dist(chirp->x, chirp->y, env->bug_x, env->bug_y); + float left_path = source_path + dist(env->bug_x, env->bug_y, left_ear_x, left_ear_y); + float right_path = source_path + dist(env->bug_x, env->bug_y, right_ear_x, right_ear_y); + float best_path = fminf(left_path, right_path); + if (best_path > MAX_ECHO_RANGE) return -1.0f; + + return chirp->birth_tick + chirp_slice_ticks(chirp, 0) + + best_path / env->sound_speed / TICK_RATE; +} + +static inline void schedule_echo(Bat* env, ChirpEvent* chirp, + float slice_ticks, float freq, float rx, float ry, float rvx, float rvy, + float strength, int source) { + float fx = cosf(env->heading); + float fy = sinf(env->heading); + float lateral_x = -fy; + float lateral_y = fx; + float left_ear_x, left_ear_y, right_ear_x, right_ear_y; + ear_positions(env, &left_ear_x, &left_ear_y, &right_ear_x, &right_ear_y); + + float ux, uy; + norm_vec(rx - chirp->x, ry - chirp->y, &ux, &uy); + float forward = ux * fx + uy * fy; + if (forward < ECHO_MIN_FORWARD) return; + + float front_gain = bat_clampf(forward, 0.0f, 1.0f); + float left_side_gain = bat_clampf(ux * -lateral_x + uy * -lateral_y, 0.0f, 1.0f); + float right_side_gain = bat_clampf(ux * lateral_x + uy * lateral_y, 0.0f, 1.0f); + front_gain *= front_gain; + left_side_gain *= left_side_gain; + right_side_gain *= right_side_gain; + float left_gain = env->ear_rear_gain + env->ear_front_gain * front_gain + + env->ear_side_gain * left_side_gain; + float right_gain = env->ear_rear_gain + env->ear_front_gain * 
front_gain + + env->ear_side_gain * right_side_gain; + + float source_path = dist(chirp->x, chirp->y, rx, ry); + float left_path = source_path + dist(rx, ry, left_ear_x, left_ear_y); + float right_path = source_path + dist(rx, ry, right_ear_x, right_ear_y); + if (left_path > MAX_ECHO_RANGE && right_path > MAX_ECHO_RANGE) return; + + float rel_vx = rvx - env->vx; + float rel_vy = rvy - env->vy; + float distance_rate = rel_vx * ux + rel_vy * uy; + float doppler = bat_clampf(-distance_rate / (env->max_speed + BUG_SPEED), -1.0f, 1.0f); + float shifted_freq = bat_clampf(freq + 0.20f * doppler, 0.0f, 1.0f); + + schedule_ear_echo(env, chirp->birth_tick, 0, + slice_ticks, shifted_freq, strength, left_path, left_gain, source); + schedule_ear_echo(env, chirp->birth_tick, 1, + slice_ticks, shifted_freq, strength, right_path, right_gain, source); +} + +static inline void schedule_segment_reflectors(Bat* env, ChirpEvent* chirp, + float slice_ticks, float freq, float x1, float y1, float x2, float y2, + float strength) { + float len = dist(x1, y1, x2, y2); + int count = (int)(len / REFLECTOR_SPACING) + 1; + for (int i = 0; i <= count; i++) { + float t = i / (float)count; + float x = x1 + (x2 - x1) * t; + float y = y1 + (y2 - y1) * t; + schedule_echo(env, chirp, slice_ticks, freq, x, y, 0.0f, 0.0f, strength, ECHO_STATIC); + } +} + +static inline void schedule_corner_reflector_echoes(Bat* env, ChirpEvent* chirp, + float slice_ticks, float freq) { + float w = (float)ARENA_WIDTH; + float h = (float)ARENA_HEIGHT; + for (int i = 0; i < ARENA_REFLECTORS; i++) { + schedule_echo(env, chirp, slice_ticks, freq, + ARENA_REFLECTOR_X[i] * w, ARENA_REFLECTOR_Y[i] * h, + 0.0f, 0.0f, env->reflector_strength, ECHO_STATIC); + } +} + +static inline void schedule_obstacle_echoes(Bat* env, ChirpEvent* chirp, + float slice_ticks, float freq, int i) { + float x = env->obstacle_x[i]; + float y = env->obstacle_y[i]; + float w = env->obstacle_w[i]; + float h = env->obstacle_h[i]; + 
schedule_segment_reflectors(env, chirp, slice_ticks, freq, x, y, x + w, y, 0.55f); + schedule_segment_reflectors(env, chirp, slice_ticks, freq, x, y + h, x + w, y + h, 0.55f); + schedule_segment_reflectors(env, chirp, slice_ticks, freq, x, y, x, y + h, 0.55f); + schedule_segment_reflectors(env, chirp, slice_ticks, freq, x + w, y, x + w, y + h, 0.55f); +} + +static inline void schedule_chirp_slice_echoes(Bat* env, ChirpEvent* chirp, + int slice_idx) { + if (slice_idx >= chirp->slice_count) { + return; + } + + float t = (slice_idx + 0.5f) / (float)chirp->slice_count; + float slice_ticks = chirp_slice_ticks(chirp, slice_idx); + float freq = chirp->start_freq + t * (chirp->end_freq - chirp->start_freq); + + ChirpEvent slice_chirp = { + .x = env->x, + .y = env->y, + .birth_tick = chirp->birth_tick, + }; + chirp->source_x[slice_idx] = slice_chirp.x; + chirp->source_y[slice_idx] = slice_chirp.y; + + schedule_echo(env, &slice_chirp, slice_ticks, freq, + env->bug_x, env->bug_y, env->bug_vx, env->bug_vy, 8.0f, ECHO_BUG); + schedule_segment_reflectors(env, &slice_chirp, slice_ticks, freq, + 0.0f, 0.0f, (float)ARENA_WIDTH, 0.0f, 0.12f); + schedule_segment_reflectors(env, &slice_chirp, slice_ticks, freq, + 0.0f, (float)ARENA_HEIGHT, (float)ARENA_WIDTH, (float)ARENA_HEIGHT, 0.12f); + schedule_segment_reflectors(env, &slice_chirp, slice_ticks, freq, + 0.0f, 0.0f, 0.0f, (float)ARENA_HEIGHT, 0.12f); + schedule_segment_reflectors(env, &slice_chirp, slice_ticks, freq, + (float)ARENA_WIDTH, 0.0f, (float)ARENA_WIDTH, (float)ARENA_HEIGHT, 0.12f); + schedule_corner_reflector_echoes(env, &slice_chirp, slice_ticks, freq); + for (int j = 0; j < env->num_obstacles; j++) { + schedule_obstacle_echoes(env, &slice_chirp, slice_ticks, freq, j); + } +} + +static inline void schedule_due_chirp_slices(Bat* env) { + for (int i = 0; i < CHIRP_HISTORY; i++) { + ChirpEvent* chirp = &env->chirps[i]; + if (!chirp->active) continue; + + float age_ticks = (float)(env->tick - chirp->birth_tick); + while 
(chirp->slices_scheduled < chirp->slice_count) { + int slice_idx = chirp->slices_scheduled; + float slice_ticks = chirp_slice_ticks(chirp, slice_idx); + if (slice_ticks >= age_ticks + 1.0f) break; + schedule_chirp_slice_echoes(env, chirp, slice_idx); + chirp->slices_scheduled += 1; + } + } +} + +void compute_observations(Bat* env) { + memset(env->observations, 0, OBS_SIZE * sizeof(float)); + env->tick_bug_echo_path = -1.0f; + + int slot = env->tick % ECHO_QUEUE_TICKS; + EchoBucket* bucket = &env->echo_queue[slot]; + if (bucket->tick == env->tick) { + for (int i = 0; i < FREQ_BINS; i++) { + env->observations[LEFT_FREQ_OFFSET + i] = bat_clampf(bucket->energy[0][i], 0.0f, 1.0f); + env->observations[RIGHT_FREQ_OFFSET + i] = bat_clampf(bucket->energy[1][i], 0.0f, 1.0f); + } + if (bucket->closest_bug_echo_path >= 0.0f) { + env->tick_bug_echo_path = bucket->closest_bug_echo_path; + } + clear_echo_bucket(bucket); + } + + float chirp_age_denom = chirp_age_norm_denominator(env); + int chirp_age = env->tick - env->last_chirp_tick; + if (env->last_chirp_tick < 0) chirp_age = (int)ceilf(chirp_age_denom); + int cooldown = env->chirp_cooldown_ticks - (env->tick - env->last_chirp_tick); + env->observations[CHIRP_AGE_OBS] = bat_clampf(chirp_age / chirp_age_denom, 0.0f, 1.0f); + env->observations[CHIRP_COOLDOWN_OBS] = bat_clampf(cooldown / (float)env->chirp_cooldown_ticks, 0.0f, 1.0f); + env->observations[CHIRP_START_OBS] = env->last_chirp_start_freq; + env->observations[CHIRP_END_OBS] = env->last_chirp_end_freq; + env->observations[CHIRP_DURATION_OBS] = env->last_chirp_duration; + env->observations[CHIRPS_USED_OBS] = chirps_used_ratio(env); + float fwd_speed = env->vx * cosf(env->heading) + env->vy * sinf(env->heading); + env->observations[FORWARD_SPEED_OBS] = bat_clampf(fwd_speed / env->max_speed, 0.0f, 1.0f); + env->observations[TURN_RATE_OBS] = bat_clampf(env->turn_velocity / env->turn_rate, -1.0f, 1.0f); + env->observations[TIMER_OBS] = bat_clampf(env->tick / (float)MAX_STEPS, 
0.0f, 1.0f); +} + +static inline void reset_episode(Bat* env) { + env->tick = 0; + env->turn_velocity = 0.0f; + env->heading = randf(env) * TWO_PI - PI_F; + env->vx = cosf(env->heading) * env->min_speed; + env->vy = sinf(env->heading) * env->min_speed; + if (env->curriculum_level < env->curriculum_initial_level) { + env->curriculum_level = env->curriculum_initial_level; + } + env->num_obstacles = curriculum_obstacles(env); + env->bug_inbound = env->curriculum_level >= CURRICULUM_INBOUND_START_LEVEL; + sample_spawns_at_distance(env, env->bug_inbound + ? curriculum_inbound_bug_distance(env) + : curriculum_bug_distance(env)); + generate_obstacles(env); + reset_bug_motion(env); + // TODO: Revisit these first-observation defaults when we are ready to break determinism. + env->last_chirp_start_freq = 0.0f; + env->last_chirp_end_freq = 1.0f; + env->last_chirp_duration = 0.33333334f; + env->last_chirp_tick = -env->chirp_cooldown_ticks; + memset(env->chirps, 0, sizeof(env->chirps)); + env->chirp_head = 0; + clear_echo_queue(env); + env->tick_bug_echo_path = -1.0f; + env->last_bug_echo_path = -1.0f; + env->last_bug_echo_expected_tick = -1.0f; + env->chirps_emitted = 0; + env->episode_return = 0.0f; + env->start_bug_dist = dist(env->x, env->y, env->bug_x, env->bug_y); + env->prev_bug_dist = env->start_bug_dist; + env->last_bug_echo_x = env->x; + env->last_bug_echo_y = env->y; + compute_observations(env); +} + +void c_reset(Bat* env) { + env->rewards[0] = 0.0f; + env->terminals[0] = 0.0f; + reset_episode(env); +} + +static inline bool hits_obstacle(Bat* env) { + for (int i = 0; i < env->num_obstacles; i++) { + if (circle_rect_collision(env->x, env->y, AGENT_RADIUS, + env->obstacle_x[i], env->obstacle_y[i], env->obstacle_w[i], env->obstacle_h[i])) { + return true; + } + } + return false; +} + +static inline bool hits_wall(Bat* env) { + return env->x - AGENT_RADIUS < 0.0f || + env->x + AGENT_RADIUS > ARENA_WIDTH || + env->y - AGENT_RADIUS < 0.0f || + env->y + AGENT_RADIUS > 
// Advance the bug by one step of length dt: pick its velocity from the current
// maneuver mode, integrate its position, and bounce it off the arena walls.
//
// Inbound bugs (env->bug_inbound) steer toward the bat every step; other bugs
// fly along env->bug_base_heading, optionally perturbed by the maneuver mode.
// When the bug is neither inbound nor maneuvering, its velocity is left as is.
static inline void update_bug(Bat* env, float dt) {
    float speed = env->bug_inbound ? BUG_SPEED * INBOUND_BUG_SPEED_MULTIPLIER : BUG_SPEED;
    float strength = curriculum_bug_maneuver_strength(env);
    if (env->bug_maneuver_mode > 0) {
        // Advance the maneuver oscillator; a single subtraction keeps the
        // phase bounded as long as one step adds less than TWO_PI.
        env->bug_maneuver_phase += env->bug_maneuver_rate * dt;
        if (env->bug_maneuver_phase > TWO_PI) {
            env->bug_maneuver_phase -= TWO_PI;
        }
    }

    if (env->bug_inbound) {
        // (tx, ty): direction from the bug toward the bat, from norm_vec.
        // (px, py): the same vector rotated by 90 degrees, used for weaving.
        float tx, ty;
        norm_vec(env->x - env->bug_x, env->y - env->bug_y, &tx, &ty);
        float px = -ty;
        float py = tx;
        float lateral = 0.0f;
        if (env->bug_maneuver_mode > 0) {
            lateral = strength * sinf(env->bug_maneuver_phase);
            if (env->bug_maneuver_mode == 2) {
                // Mode 2 adds a constant sideways bias in the direction of bug_maneuver_sign.
                lateral += 0.5f * strength * env->bug_maneuver_sign;
            } else if (env->bug_maneuver_mode == 3) {
                // Mode 3 adds a second, half-frequency oscillation.
                lateral += 0.35f * strength * cosf(0.5f * env->bug_maneuver_phase);
            }
        }
        // Cap the sideways share at 0.5 so the toward-the-bat share below
        // stays at least sqrt(0.75); forward^2 + lateral^2 == 1.
        lateral = bat_clampf(lateral, -0.50f, 0.50f);
        float forward = sqrtf(fmaxf(0.0f, 1.0f - lateral * lateral));
        env->bug_vx = (tx * forward + px * lateral) * speed;
        env->bug_vy = (ty * forward + py * lateral) * speed;
    } else if (env->bug_maneuver_mode > 0) {
        float heading = env->bug_base_heading;
        if (env->bug_maneuver_mode == 1) {
            // Mode 1: oscillate around the base heading without changing it.
            heading += strength * sinf(env->bug_maneuver_phase);
        } else if (env->bug_maneuver_mode == 2) {
            // Mode 2: the base heading itself drifts every step (a steady turn).
            env->bug_base_heading += env->bug_maneuver_sign * strength * dt;
            heading = env->bug_base_heading;
        } else {
            // Remaining modes: two superimposed oscillations around the base heading.
            heading += strength * sinf(env->bug_maneuver_phase)
                + 0.35f * strength * cosf(0.5f * env->bug_maneuver_phase);
        }
        env->bug_vx = cosf(heading) * speed;
        env->bug_vy = sinf(heading) * speed;
    }

    env->bug_x += env->bug_vx * dt;
    env->bug_y += env->bug_vy * dt;
    // Wall handling: clamp the bug back inside the arena and force the
    // velocity component on that axis to point away from the wall.
    bool bounced = false;
    if (env->bug_x - BUG_RADIUS < 0.0f) {
        env->bug_x = BUG_RADIUS;
        env->bug_vx = fabsf(env->bug_vx);
        bounced = true;
    }
    if (env->bug_x + BUG_RADIUS > ARENA_WIDTH) {
        env->bug_x = ARENA_WIDTH - BUG_RADIUS;
        env->bug_vx = -fabsf(env->bug_vx);
        bounced = true;
    }
    if (env->bug_y - BUG_RADIUS < 0.0f) {
        env->bug_y = BUG_RADIUS;
        env->bug_vy = fabsf(env->bug_vy);
        bounced = true;
    }
    if (env->bug_y + BUG_RADIUS > ARENA_HEIGHT) {
        env->bug_y = ARENA_HEIGHT - BUG_RADIUS;
        env->bug_vy = -fabsf(env->bug_vy);
        bounced = true;
    }
    if (bounced) {
        if (env->bug_inbound) {
            // Inbound bugs discard the reflected velocity and re-aim straight
            // at the bat from the clamped position.
            float tx, ty;
            norm_vec(env->x - env->bug_x, env->y - env->bug_y, &tx, &ty);
            env->bug_vx = tx * speed;
            env->bug_vy = ty * speed;
        }
        // Keep the base heading consistent with the post-bounce velocity so
        // later maneuvers are applied around the new direction of travel.
        env->bug_base_heading = atan2f(env->bug_vy, env->bug_vx);
    }
}
CHIRP_FREQ_BINS); + env->last_chirp_duration = norm_bin(duration_idx, CHIRP_DURATION_BINS); + env->last_chirp_tick = env->tick; + env->chirps_emitted += 1; + ChirpEvent* chirp = &env->chirps[env->chirp_head]; + chirp->x = env->x; + chirp->y = env->y; + chirp->start_freq = env->last_chirp_start_freq; + chirp->end_freq = env->last_chirp_end_freq; + chirp->duration = chirp_duration_seconds(env->last_chirp_duration); + chirp->birth_tick = env->tick; + chirp->slice_count = (int)ceilf(chirp->duration / TICK_RATE); + chirp->slices_scheduled = 0; + chirp->active = 1; + env->chirp_head = (env->chirp_head + 1) % CHIRP_HISTORY; + env->audio_chirp_serial += 1; + env->last_bug_echo_expected_tick = expected_bug_echo_tick(env, chirp); + return true; +} + +static inline float next_chirp_overlap_fraction(Bat* env) { + if (env->last_bug_echo_expected_tick <= (float)env->tick) return 0.0f; + float wait_ticks = env->last_bug_echo_expected_tick - (float)env->last_chirp_tick; + float remaining_ticks = env->last_bug_echo_expected_tick - (float)env->tick; + return bat_clampf(remaining_ticks / wait_ticks, 0.0f, 1.0f); +} + +static inline ChirpStatus update_chirp(Bat* env) { + int emit = (int)env->actions[ACTION_CHIRP_EMIT]; + if (emit) { + if (env->chirps_emitted >= MAX_CHIRPS_PER_EPISODE) { + return CHIRP_STATUS_OVER_BUDGET; + } + return try_emit_chirp(env) ? 
// Advance the environment by one tick using the actions in env->actions.
//
// Order of operations: handle the chirp action, schedule due chirp slices,
// move the bat and the bug, then classify the outcome (collision, capture,
// timeout, or ordinary step) and write env->rewards[0] / env->terminals[0].
// On a terminal outcome the episode is logged and reset before returning, so
// the observations left in place are those written by reset_episode for the
// new episode. Otherwise observations are recomputed and the bug-echo shaping
// reward is applied.
void c_step(Bat* env) {
    env->rewards[0] = 0.0f;
    env->terminals[0] = 0.0f;
    float success = 0.0f;
    float collision = 0.0f;
    float timeout = 0.0f;

    // Sample the overlap before update_chirp: emitting a chirp overwrites
    // last_chirp_tick and last_bug_echo_expected_tick, which the fraction reads.
    float chirp_overlap_fraction = next_chirp_overlap_fraction(env);
    ChirpStatus chirp_status = update_chirp(env);
    if (chirp_status == CHIRP_STATUS_OVER_BUDGET) {
        // Asking for a chirp beyond the per-episode budget ends the episode
        // immediately; it is reported through the collision flag.
        env->tick += 1;
        env->rewards[0] = -1.0f;
        collision = 1.0f;
    } else {
        // Slices are scheduled before motion, so they are emitted from the
        // pre-move position and against the pre-increment tick.
        schedule_due_chirp_slices(env);

        update_motion(env, TICK_RATE);
        update_bug(env, TICK_RATE);
        env->tick += 1;
        if (hits_wall(env) || hits_obstacle(env)) {
            env->rewards[0] = -env->collision_penalty;
            collision = 1.0f;
        } else if (dist(env->x, env->y, env->bug_x, env->bug_y) <= AGENT_RADIUS + BUG_RADIUS) {
            // Capture: the reward is scaled by chirp_efficiency(env).
            env->rewards[0] = env->chirp_efficiency_reward * chirp_efficiency(env);
            success = 1.0f;
        } else {
            // Ordinary step: dense shaping on the change in distance to the
            // bug, a per-step cost, and chirp bonuses or penalties.
            float bug_dist = dist(env->x, env->y, env->bug_x, env->bug_y);
            env->rewards[0] += env->progress_reward_scale * (env->prev_bug_dist - bug_dist);
            env->rewards[0] -= env->step_cost; // TODO: Fold this only when we are ready to break training determinism.
            if (chirp_status == CHIRP_STATUS_EMITTED) {
                env->rewards[0] += env->valid_chirp_reward; // TODO: Remove this; chirps should only pay when bug echoes improve.
                // Penalise chirping while the previous chirp's bug echo was
                // still expected, in proportion to the wait that remained.
                if (chirp_overlap_fraction > 0.0f) {
                    env->rewards[0] -= env->chirp_overlap_penalty * chirp_overlap_fraction;
                }
            } else if (chirp_status == CHIRP_STATUS_COOLDOWN) {
                env->rewards[0] -= env->early_chirp_penalty;
            }
            env->prev_bug_dist = bug_dist;

            // A timeout replaces the shaped reward computed above.
            if (env->tick >= MAX_STEPS) {
                env->rewards[0] = -1.0f;
                timeout = 1.0f;
            }
        }
    }

    if (success || collision || timeout) {
        env->terminals[0] = 1.0f;
        env->episode_return += env->rewards[0];
        // The curriculum advances before reset_episode so the next episode is
        // generated at the new level.
        if (success) {
            advance_curriculum(env);
        }
        add_log(env, success, collision, timeout);
        reset_episode(env);
        return;
    }

    compute_observations(env);
    // Echo shaping: when a bug echo arrived this tick, compare its path length
    // with the previous bug echo. The comparison is skipped until the bat has
    // moved at least BUG_ECHO_MIN_DISPLACEMENT since that previous echo.
    if (env->tick_bug_echo_path > 0.0f) {
        if (env->last_bug_echo_path > 0.0f && dist(env->last_bug_echo_x, env->last_bug_echo_y,
                env->x, env->y) >= BUG_ECHO_MIN_DISPLACEMENT) {
            float echo_progress = (env->last_bug_echo_path - env->tick_bug_echo_path)
                / MAX_ECHO_RANGE;
            if (echo_progress > 0.0f) {
                env->rewards[0] += env->bug_echo_reward_scale * echo_progress;
            } else if (echo_progress < 0.0f) {
                // echo_progress is negative here, so this term is a penalty
                // scaled by bug_echo_farther_penalty_scale.
                env->rewards[0] += env->bug_echo_reward_scale
                    * env->bug_echo_farther_penalty_scale * echo_progress;
            }
        }
        // The reference echo and position are refreshed on every bug echo,
        // including ones that were not scored above.
        env->last_bug_echo_path = env->tick_bug_echo_path;
        env->last_bug_echo_x = env->x;
        env->last_bug_echo_y = env->y;
    }
    env->episode_return += env->rewards[0];
}
env->sound_speed + chirp->duration) { + chirp->active = 0; + continue; + } + + for (int ring = 0; ring < CHIRP_RINGS; ring++) { + float slice = ring / (float)(CHIRP_RINGS - 1); + float freq = chirp->start_freq + slice * (chirp->end_freq - chirp->start_freq); + float ring_age = age_seconds - slice * chirp->duration; + if (ring_age <= 0.0f) continue; + float radius = env->sound_speed * ring_age; + if (radius > MAX_ECHO_RANGE) continue; + + float alpha = 0.18f + 0.42f * bat_clampf( + 1.0f - radius / MAX_ECHO_RANGE, 0.0f, 1.0f); + float source_x, source_y; + int slice_idx = (int)floorf(slice * (float)chirp->slice_count); + if (slice_idx >= chirp->slice_count) slice_idx = chirp->slice_count - 1; + chirp_source_for_slice(chirp, slice_idx, &source_x, &source_y); + DrawCircleLines( + (int)(source_x * sx), + (int)(source_y * sy), + radius * fminf(sx, sy), + freq_color(freq, alpha)); + } + } +} + +static inline Color doppler_ray_color(float doppler, float alpha) { + if (doppler > 0.05f) { + return freq_color(1.0f, alpha); + } else if (doppler < -0.05f) { + return freq_color(0.0f, alpha); + } + return (Color){210, 210, 220, + (unsigned char)(255.0f * bat_clampf(alpha, 0.0f, 1.0f))}; +} + +static inline void clear_freq_history(Client* client) { + memset(client->freq_history, 0, sizeof(client->freq_history)); + client->freq_history_head = 0; + client->freq_history_last_tick = -1; +} + +static inline void capture_freq_history(Bat* env) { + Client* client = env->client; + if (env->tick < client->freq_history_last_tick) { + clear_freq_history(client); + } + if (env->tick == client->freq_history_last_tick) return; + + float (*sample)[FREQ_BINS] = client->freq_history[client->freq_history_head]; + for (int i = 0; i < FREQ_BINS; i++) { + sample[0][i] = env->observations[LEFT_FREQ_OFFSET + i]; + sample[1][i] = env->observations[RIGHT_FREQ_OFFSET + i]; + } + + client->freq_history_head = (client->freq_history_head + 1) % FREQ_HISTORY_TICKS; + client->freq_history_last_tick = env->tick; 
+} + +static inline Color freq_history_color(int bin, float energy) { + float e = sqrtf(bat_clampf(energy, 0.0f, 1.0f)); + if (e <= 0.001f) return (Color){42, 46, 56, 255}; + + Color base = freq_color(bin / (float)(FREQ_BINS - 1), 1.0f); + float brightness = 0.25f + 0.75f * e; + return (Color){ + (unsigned char)(36.0f + 219.0f * (base.r / 255.0f) * brightness), + (unsigned char)(36.0f + 219.0f * (base.g / 255.0f) * brightness), + (unsigned char)(36.0f + 219.0f * (base.b / 255.0f) * brightness), + 255, + }; +} + +static inline void draw_freq_history_band(Client* client, + int ear, int x, int y, int width, int height) { + float col_width = width / (float)FREQ_HISTORY_TICKS; + float row_height = height / (float)FREQ_BINS; + for (int t = 0; t < FREQ_HISTORY_TICKS; t++) { + int history_idx = (client->freq_history_head + FREQ_HISTORY_TICKS - 1 - t) + % FREQ_HISTORY_TICKS; + int x0 = x + (int)(t * col_width); + int x1 = x + (int)((t + 1) * col_width); + if (x1 <= x0) x1 = x0 + 1; + + for (int row = 0; row < FREQ_BINS; row++) { + int bin = FREQ_BINS - 1 - row; + int y0 = y + (int)(row * row_height); + int y1 = y + (int)((row + 1) * row_height); + if (y1 <= y0) y1 = y0 + 1; + DrawRectangle(x0, y0, x1 - x0, y1 - y0, + freq_history_color(bin, client->freq_history[history_idx][ear][bin])); + } + } +} + +typedef struct ObsBar { + const char* label; + int obs_idx; + Color color; + bool signed_value; +} ObsBar; + +static inline void draw_obs_bar(int x, int y, int width, + const ObsBar* bar, const float* observations) { + const int label_width = 68; + const int bar_height = 12; + int bar_x = x + label_width; + int bar_width = width - label_width; + + DrawText(bar->label, x, y - 1, 10, (Color){226, 230, 238, 255}); + DrawRectangle(bar_x, y, bar_width, bar_height, (Color){48, 52, 62, 255}); + + if (bar->signed_value) { + int center = bar_x + bar_width / 2; + float value = bat_clampf(observations[bar->obs_idx], -1.0f, 1.0f); + int fill = (int)(fabsf(value) * bar_width * 0.5f); + if 
(value >= 0.0f) { + DrawRectangle(center, y, fill, bar_height, bar->color); + } else { + DrawRectangle(center - fill, y, fill, bar_height, bar->color); + } + DrawLine(center, y, center, y + bar_height, (Color){196, 200, 210, 255}); + } else { + float value = bat_clampf(observations[bar->obs_idx], 0.0f, 1.0f); + DrawRectangle(bar_x, y, (int)(value * bar_width), bar_height, bar->color); + } + + DrawRectangleLines(bar_x, y, bar_width, bar_height, (Color){118, 126, 142, 255}); +} + +static inline void draw_arrow_line(int x0, int y0, int x1, int y1, Color color) { + DrawLine(x0, y0, x1, y1, color); + float angle = atan2f((float)(y1 - y0), (float)(x1 - x0)); + const float head = 7.0f; + DrawLine(x1, y1, + (int)(x1 - cosf(angle - 0.45f) * head), + (int)(y1 - sinf(angle - 0.45f) * head), color); + DrawLine(x1, y1, + (int)(x1 - cosf(angle + 0.45f) * head), + (int)(y1 - sinf(angle + 0.45f) * head), color); +} + +static inline void draw_observation_bars(Bat* env, int x, int y, int width) { + static const ObsBar chirp_bars[] = { + {"age", CHIRP_AGE_OBS, {112, 196, 255, 255}, false}, + {"cooldown", CHIRP_COOLDOWN_OBS, {255, 206, 96, 255}, false}, + {"start", CHIRP_START_OBS, {255, 112, 160, 255}, false}, + {"end", CHIRP_END_OBS, {126, 224, 255, 255}, false}, + {"duration", CHIRP_DURATION_OBS, {190, 154, 255, 255}, false}, + {"used", CHIRPS_USED_OBS, {255, 150, 96, 255}, false}, + }; + static const ObsBar action_bars[] = { + {"speed", FORWARD_SPEED_OBS, {120, 226, 142, 255}, false}, + {"turn", TURN_RATE_OBS, {255, 112, 112, 255}, true}, + }; + static const ObsBar timer_bar = {"timer", TIMER_OBS, {88, 164, 255, 255}, false}; + + const int row_step = 18; + const Color header = (Color){246, 248, 255, 255}; + int chirp_count = (int)(sizeof(chirp_bars) / sizeof(chirp_bars[0])); + int action_count = (int)(sizeof(action_bars) / sizeof(action_bars[0])); + + DrawText("Chirp", x, y, 12, header); + y += 18; + for (int i = 0; i < chirp_count; i++) { + draw_obs_bar(x, y + i * row_step, 
width, &chirp_bars[i], env->observations); + } + y += chirp_count * row_step + 14; + + DrawText("Actions", x, y, 12, header); + y += 18; + for (int i = 0; i < action_count; i++) { + draw_obs_bar(x, y + i * row_step, width, &action_bars[i], env->observations); + } + y += action_count * row_step + 14; + + DrawText("Episode", x, y, 12, header); + y += 18; + draw_obs_bar(x, y, width, &timer_bar, env->observations); +} + +static inline void draw_freq_history_panel(Bat* env, int x, int y, int width, int height) { + capture_freq_history(env); + + DrawRectangle(x, y, width, height, (Color){32, 36, 46, 255}); + int band_width = FREQ_WATERFALL_WIDTH - 2 * FREQ_PANEL_MARGIN; + int band_height = (height - 3 * FREQ_PANEL_MARGIN) / 2; + int left_y = y + FREQ_PANEL_MARGIN; + int right_y = left_y + band_height + FREQ_PANEL_MARGIN; + int obs_x = x + FREQ_WATERFALL_WIDTH + FREQ_PANEL_MARGIN; + int obs_width = width - FREQ_WATERFALL_WIDTH - 2 * FREQ_PANEL_MARGIN; + + draw_freq_history_band(env->client, 0, x + FREQ_PANEL_MARGIN, left_y, + band_width, band_height); + draw_freq_history_band(env->client, 1, x + FREQ_PANEL_MARGIN, right_y, + band_width, band_height); + draw_observation_bars(env, obs_x, y + FREQ_PANEL_MARGIN, obs_width); + Color reflection_color = (Color){255, 96, 96, 255}; + int reflection_text_x = obs_x + 40; + int reflection_text_y = (left_y + right_y + band_height) / 2 - 6; + int reflection_source_x = reflection_text_x - 8; + int reflection_source_y = reflection_text_y + 8; + int reflection_target_x = x + FREQ_PANEL_MARGIN + band_width - 4; + DrawText("Reflections L/R", reflection_text_x, reflection_text_y, 12, reflection_color); + draw_arrow_line(reflection_source_x, reflection_source_y, + reflection_target_x, left_y + band_height / 2, reflection_color); + draw_arrow_line(reflection_source_x, reflection_source_y + 10, + reflection_target_x, right_y + band_height / 2, reflection_color); + + DrawRectangleLines(x, y, width, height, (Color){124, 132, 148, 255}); + 
DrawRectangleLines(x + FREQ_PANEL_MARGIN, left_y, band_width, band_height, + (Color){102, 110, 126, 255}); + DrawRectangleLines(x + FREQ_PANEL_MARGIN, right_y, band_width, band_height, + (Color){102, 110, 126, 255}); + DrawLine(x + FREQ_WATERFALL_WIDTH, y, x + FREQ_WATERFALL_WIDTH, y + height, + (Color){86, 94, 110, 255}); +} + +static inline void draw_echo_flash(Bat* env, ChirpEvent* chirp, + float rx, float ry, float rvx, float rvy, float strength, + float sx, float sy) { + float age_seconds = (env->tick - chirp->birth_tick) * TICK_RATE; + float distance = dist(chirp->x, chirp->y, rx, ry); + float echo_time = 2.0f * distance / env->sound_speed; + if (fabsf(age_seconds - echo_time) > 0.025f) return; + + float ux, uy; + norm_vec(rx - chirp->x, ry - chirp->y, &ux, &uy); + float rel_vx = rvx - env->vx; + float rel_vy = rvy - env->vy; + float distance_rate = rel_vx * ux + rel_vy * uy; + float doppler = bat_clampf(-distance_rate / (env->max_speed + BUG_SPEED), -1.0f, 1.0f); + float amp = strength / (1.0f + 0.02f * distance * distance); + float alpha = bat_clampf(0.20f + amp * 2.0f, 0.20f, 0.90f); + Color color = doppler_ray_color(doppler, alpha); + + DrawLine((int)(chirp->x * sx), (int)(chirp->y * sy), + (int)(rx * sx), (int)(ry * sy), color); + DrawCircleLines((int)(rx * sx), (int)(ry * sy), + fmaxf(3.0f, 8.0f * alpha), color); +} + +static inline void draw_segment_echoes(Bat* env, ChirpEvent* chirp, + float x1, float y1, float x2, float y2, float strength, + float sx, float sy) { + float len = dist(x1, y1, x2, y2); + int count = (int)(len / REFLECTOR_SPACING) + 1; + for (int i = 0; i <= count; i++) { + float t = i / (float)count; + float x = x1 + (x2 - x1) * t; + float y = y1 + (y2 - y1) * t; + draw_echo_flash(env, chirp, x, y, 0.0f, 0.0f, strength, sx, sy); + } +} + +static inline void draw_obstacle_echoes(Bat* env, ChirpEvent* chirp, + int i, float sx, float sy) { + float x = env->obstacle_x[i]; + float y = env->obstacle_y[i]; + float w = env->obstacle_w[i]; + 
float h = env->obstacle_h[i]; + draw_segment_echoes(env, chirp, x, y, x + w, y, 0.55f, sx, sy); + draw_segment_echoes(env, chirp, x, y + h, x + w, y + h, 0.55f, sx, sy); + draw_segment_echoes(env, chirp, x, y, x, y + h, 0.55f, sx, sy); + draw_segment_echoes(env, chirp, x + w, y, x + w, y + h, 0.55f, sx, sy); +} + +static inline void draw_corner_reflector_echoes(Bat* env, ChirpEvent* chirp, + float sx, float sy) { + float w = (float)ARENA_WIDTH; + float h = (float)ARENA_HEIGHT; + for (int i = 0; i < ARENA_REFLECTORS; i++) { + draw_echo_flash(env, chirp, ARENA_REFLECTOR_X[i] * w, + ARENA_REFLECTOR_Y[i] * h, 0.0f, 0.0f, env->reflector_strength, sx, sy); + } +} + +static inline void draw_corner_reflector_markers(int width, int height) { + const int size = 8; + const Color fill = (Color){128, 128, 132, 255}; + const Color outline = (Color){202, 202, 208, 255}; + int max_x = width - size; + int max_y = height - size; + for (int i = 0; i < ARENA_REFLECTORS; i++) { + int x = (int)(ARENA_REFLECTOR_X[i] * max_x); + int y = (int)(ARENA_REFLECTOR_Y[i] * max_y); + DrawRectangle(x, y, size, size, fill); + DrawRectangleLines(x, y, size, size, outline); + } +} + +static inline void draw_echo_reflections(Bat* env, float sx, float sy) { + for (int i = 0; i < CHIRP_HISTORY; i++) { + ChirpEvent* chirp = &env->chirps[i]; + if (!chirp->active) continue; + draw_echo_flash(env, chirp, env->bug_x, env->bug_y, + env->bug_vx, env->bug_vy, 4.0f, sx, sy); + draw_segment_echoes(env, chirp, 0.0f, 0.0f, (float)ARENA_WIDTH, 0.0f, 0.18f, sx, sy); + draw_segment_echoes(env, chirp, 0.0f, (float)ARENA_HEIGHT, (float)ARENA_WIDTH, (float)ARENA_HEIGHT, 0.18f, sx, sy); + draw_segment_echoes(env, chirp, 0.0f, 0.0f, 0.0f, (float)ARENA_HEIGHT, 0.18f, sx, sy); + draw_segment_echoes(env, chirp, (float)ARENA_WIDTH, 0.0f, (float)ARENA_WIDTH, (float)ARENA_HEIGHT, 0.18f, sx, sy); + draw_corner_reflector_echoes(env, chirp, sx, sy); + for (int j = 0; j < env->num_obstacles; j++) { + draw_obstacle_echoes(env, chirp, 
j, sx, sy); + } + } +} + +#include "bat_record.h" + +Client* make_client(Bat* env) { + Client* client = (Client*)calloc(1, sizeof(Client)); + client->width = ARENA_WIDTH * 10 + FREQ_PANEL_WIDTH; + client->height = ARENA_HEIGHT * 10; + clear_freq_history(client); + InitWindow(client->width, client->height, "Bat"); + SetTargetFPS(env->render_target_fps); + InitAudioDevice(); + client->audio_ready = IsAudioDeviceReady(); + record_init(env, client); + return client; +} + +void close_client(Client* client) { + record_finalize(client); + if (client->audio_ready) { + for (int i = 0; i < AUDIO_VOICES; i++) { + unload_chirp_sound(client, i); + } + CloseAudioDevice(); + } + CloseWindow(); + free(client); +} + +void c_render(Bat* env) { + if (IsKeyPressed(KEY_ESCAPE)) { + exit(0); + } + if (env->client == NULL) { + env->client = make_client(env); + } + play_chirp_audio(env); + int arena_width = env->client->width - FREQ_PANEL_WIDTH; + int arena_height = env->client->height; + float sx = arena_width / (float)ARENA_WIDTH; + float sy = env->client->height / (float)ARENA_HEIGHT; + BeginDrawing(); + ClearBackground((Color){18, 20, 24, 255}); + draw_chirp_rings(env, sx, sy); + draw_echo_reflections(env, sx, sy); + DrawRectangleLines(0, 0, arena_width, arena_height, GRAY); + for (int i = 0; i < env->num_obstacles; i++) { + DrawRectangle( + (int)(env->obstacle_x[i] * sx), + (int)(env->obstacle_y[i] * sy), + (int)(env->obstacle_w[i] * sx), + (int)(env->obstacle_h[i] * sy), + (Color){92, 92, 96, 255}); + } + draw_corner_reflector_markers(arena_width, arena_height); + DrawCircle((int)(env->bug_x * sx), (int)(env->bug_y * sy), + BUG_RADIUS * sx, GREEN); + DrawCircle((int)(env->x * sx), (int)(env->y * sy), + AGENT_RADIUS * sx, BLUE); + float hx = env->x + cosf(env->heading) * AGENT_RADIUS * 2.0f; + float hy = env->y + sinf(env->heading) * AGENT_RADIUS * 2.0f; + DrawLine((int)(env->x * sx), (int)(env->y * sy), (int)(hx * sx), (int)(hy * sy), WHITE); + int cooldown = 
env->chirp_cooldown_ticks - (env->tick - env->last_chirp_tick); + DrawText(TextFormat("reward %.3f tick %d chirps %d cooldown %d ESC exits", env->rewards[0], env->tick, + env->chirps_emitted, cooldown), 10, 10, 20, RAYWHITE); + draw_freq_history_panel(env, arena_width, 0, FREQ_PANEL_WIDTH, arena_height); + EndDrawing(); + record_capture_frame(env); +} +#else +Client* make_client(Bat* env) { + (void)env; + return NULL; +} + +void close_client(Client* client) { + (void)client; +} + +void c_render(Bat* env) { + (void)env; +} +#endif diff --git a/ocean/bat/bat_audio.h b/ocean/bat/bat_audio.h new file mode 100644 index 0000000000..5010775fc3 --- /dev/null +++ b/ocean/bat/bat_audio.h @@ -0,0 +1,96 @@ +#ifndef BAT_AUDIO_H +#define BAT_AUDIO_H + +static inline float chirp_audio_duration_at_fps(float duration_norm, int fps) { + float duration = chirp_duration_seconds(duration_norm); + float scale = 60.0f / (float)fps; + if (scale < 1.0f) scale = 1.0f; + return duration * scale; +} + +static inline float chirp_audio_duration_seconds(Bat* env, float duration_norm) { + return chirp_audio_duration_at_fps(duration_norm, env->render_target_fps); +} + +static inline float chirp_audio_frequency_hz(float freq_norm) { + return AUDIO_MIN_HZ + freq_norm + * (AUDIO_MAX_HZ - AUDIO_MIN_HZ); +} + +static inline float chirp_audio_envelope(float t_norm) { + if (t_norm <= 0.0f || t_norm >= 1.0f) return 0.0f; + return bat_clampf(fminf(t_norm / AUDIO_ENVELOPE_FADE, + (1.0f - t_norm) / AUDIO_ENVELOPE_FADE), 0.0f, 1.0f); +} + +static inline float chirp_audio_sample_f32(float start_norm, float end_norm, + float duration_seconds, int sample_index, int sample_rate) { + float t = sample_index / (float)sample_rate; + if (t >= duration_seconds) return 0.0f; + + float start_hz = chirp_audio_frequency_hz(start_norm); + float end_hz = chirp_audio_frequency_hz(end_norm); + float chirp_rate = (end_hz - start_hz) / duration_seconds; + float phase = TWO_PI * (start_hz * t + 0.5f * chirp_rate * t * t); + float 
envelope = chirp_audio_envelope(t / duration_seconds); + return AUDIO_VOLUME * envelope * sinf(phase); +} + +#ifndef BAT_HEADLESS +static inline void unload_chirp_sound(Client* client, int i) { + if (!client->chirp_sound_loaded[i]) return; + UnloadSound(client->chirp_sounds[i]); + client->chirp_sound_loaded[i] = 0; +} + +static inline void cleanup_audio(Client* client) { + if (!client->audio_ready) return; + for (int i = 0; i < AUDIO_VOICES; i++) { + if (client->chirp_sound_loaded[i] && !IsSoundPlaying(client->chirp_sounds[i])) { + unload_chirp_sound(client, i); + } + } +} + +static inline void play_chirp_audio(Bat* env) { + Client* client = env->client; + if (client == NULL || !client->audio_ready) return; + cleanup_audio(client); + if (env->audio_chirp_serial <= 0 || + env->audio_chirp_serial == client->last_audio_chirp_serial) { + return; + } + client->last_audio_chirp_serial = env->audio_chirp_serial; + + float duration = chirp_audio_duration_seconds(env, env->last_chirp_duration); + int sample_count = (int)ceilf(duration * AUDIO_SAMPLE_RATE); + + short* samples = (short*)malloc(sample_count * sizeof(short)); + if (samples == NULL) return; + for (int i = 0; i < sample_count; i++) { + float sample = chirp_audio_sample_f32(env->last_chirp_start_freq, + env->last_chirp_end_freq, duration, i, AUDIO_SAMPLE_RATE); + samples[i] = (short)(bat_clampf(sample, -1.0f, 1.0f) * 32767.0f); + } + + Wave wave = { + .frameCount = (unsigned int)sample_count, + .sampleRate = AUDIO_SAMPLE_RATE, + .sampleSize = 16, + .channels = 1, + .data = samples, + }; + Sound sound = LoadSoundFromWave(wave); + UnloadWave(wave); + + int voice = client->audio_voice_cursor; + client->audio_voice_cursor = (client->audio_voice_cursor + 1) % AUDIO_VOICES; + unload_chirp_sound(client, voice); + client->chirp_sounds[voice] = sound; + client->chirp_sound_loaded[voice] = 1; + SetSoundVolume(client->chirp_sounds[voice], 1.0f); + PlaySound(client->chirp_sounds[voice]); +} +#endif + +#endif diff --git 
a/ocean/bat/bat_record.h b/ocean/bat/bat_record.h new file mode 100644 index 0000000000..0ba923a18f --- /dev/null +++ b/ocean/bat/bat_record.h @@ -0,0 +1,164 @@ +#ifndef BAT_RECORD_H +#define BAT_RECORD_H + +static inline void record_write_le16(FILE* f, unsigned int v) { + fputc((int)(v & 0xffu), f); + fputc((int)((v >> 8) & 0xffu), f); +} + +static inline void record_write_le32(FILE* f, unsigned int v) { + fputc((int)(v & 0xffu), f); + fputc((int)((v >> 8) & 0xffu), f); + fputc((int)((v >> 16) & 0xffu), f); + fputc((int)((v >> 24) & 0xffu), f); +} + +static inline void record_write_wav_header(FILE* f, int data_bytes) { + int byte_rate = AUDIO_SAMPLE_RATE * 2; + fwrite("RIFF", 1, 4, f); + record_write_le32(f, 36u + (unsigned int)data_bytes); + fwrite("WAVE", 1, 4, f); + fwrite("fmt ", 1, 4, f); + record_write_le32(f, 16); + record_write_le16(f, 1); + record_write_le16(f, 1); + record_write_le32(f, AUDIO_SAMPLE_RATE); + record_write_le32(f, (unsigned int)byte_rate); + record_write_le16(f, 2); + record_write_le16(f, 16); + fwrite("data", 1, 4, f); + record_write_le32(f, (unsigned int)data_bytes); +} + +static inline void record_init(Bat* env, Client* client) { + if (!env->record_video || client->recording_initialized) return; + client->recording_initialized = 1; + client->record_fps = env->record_video_fps; + client->record_audio = env->record_video_audio ? 
1 : 0; + client->record_max_frames = client->record_fps * env->record_video_seconds; + snprintf(client->record_frame_dir, sizeof(client->record_frame_dir), + "recordings/bat_recording_frames"); + snprintf(client->record_wav_path, sizeof(client->record_wav_path), + "recordings/bat_recording.wav"); + snprintf(client->record_mp4_path, sizeof(client->record_mp4_path), + "recordings/bat_recording.mp4"); + system("mkdir -p recordings recordings/bat_recording_frames"); + if (client->record_audio) { + client->record_wav = fopen(client->record_wav_path, "wb"); + if (client->record_wav != NULL) { + record_write_wav_header(client->record_wav, 0); + } + } + printf("Bat recording enabled: %s (%d fps, %d frames)\n", + client->record_mp4_path, client->record_fps, client->record_max_frames); +} + +static inline void record_enqueue_chirp(Bat* env) { + Client* client = env->client; + if (client == NULL || !client->recording_initialized || + client->recording_finalized || !client->record_audio) { + return; + } + if (env->audio_chirp_serial <= 0 || + env->audio_chirp_serial == client->record_last_audio_chirp_serial) { + return; + } + client->record_last_audio_chirp_serial = env->audio_chirp_serial; + int voice_idx = client->record_voice_cursor; + client->record_voice_cursor = (client->record_voice_cursor + 1) % RECORD_MAX_VOICES; + BatRecordVoice* voice = &client->record_voices[voice_idx]; + voice->active = 1; + voice->start_sample = client->record_audio_sample_cursor; + voice->start_freq = env->last_chirp_start_freq; + voice->end_freq = env->last_chirp_end_freq; + voice->duration = chirp_audio_duration_at_fps( + env->last_chirp_duration, client->record_fps); +} + +static inline void record_append_audio_frame(Bat* env) { + Client* client = env->client; + if (client == NULL || !client->record_audio || client->record_wav == NULL) return; + int frame_samples = AUDIO_SAMPLE_RATE / client->record_fps; + for (int i = 0; i < frame_samples; i++) { + int sample_index = 
client->record_audio_sample_cursor + i; + float mixed = 0.0f; + for (int v = 0; v < RECORD_MAX_VOICES; v++) { + BatRecordVoice* voice = &client->record_voices[v]; + if (!voice->active) continue; + int local_sample = sample_index - voice->start_sample; + int voice_samples = (int)ceilf(voice->duration * AUDIO_SAMPLE_RATE); + if (local_sample < 0) continue; + if (local_sample >= voice_samples) { + voice->active = 0; + continue; + } + mixed += chirp_audio_sample_f32(voice->start_freq, voice->end_freq, + voice->duration, local_sample, AUDIO_SAMPLE_RATE); + } + short pcm = (short)(bat_clampf(mixed, -1.0f, 1.0f) * 32767.0f); + fwrite(&pcm, sizeof(short), 1, client->record_wav); + client->record_audio_data_bytes += (int)sizeof(short); + } + client->record_audio_sample_cursor += frame_samples; +} + +static inline void record_finalize(Client* client) { + if (client == NULL || !client->recording_initialized || + client->recording_finalized) { + return; + } + client->recording_finalized = 1; + if (client->record_wav != NULL) { + fseek(client->record_wav, 0, SEEK_SET); + record_write_wav_header(client->record_wav, client->record_audio_data_bytes); + fclose(client->record_wav); + client->record_wav = NULL; + } + + char cmd[1024]; + if (client->record_audio) { + snprintf(cmd, sizeof(cmd), + "ffmpeg -y -framerate %d -i %s/%%06d.png -i %s -frames:v %d " + "-c:v libx264 -pix_fmt yuv420p -c:a aac -shortest %s", + client->record_fps, client->record_frame_dir, client->record_wav_path, + client->record_frame, client->record_mp4_path); + } else { + snprintf(cmd, sizeof(cmd), + "ffmpeg -y -framerate %d -i %s/%%06d.png -frames:v %d " + "-c:v libx264 -pix_fmt yuv420p %s", + client->record_fps, client->record_frame_dir, client->record_frame, + client->record_mp4_path); + } + int status = system(cmd); + if (status == 0) { + printf("Bat recording saved: %s\n", client->record_mp4_path); + } else { + printf("Bat recording ffmpeg command failed with status %d\n", status); + } +} + +static inline 
void record_capture_frame(Bat* env) { + Client* client = env->client; + if (client == NULL || !client->recording_initialized || + client->recording_finalized) { + return; + } + if (client->record_frame >= client->record_max_frames) { + record_finalize(client); + return; + } + record_enqueue_chirp(env); + char path[512]; + snprintf(path, sizeof(path), "%s/%06d.png", client->record_frame_dir, + client->record_frame); + Image image = LoadImageFromScreen(); + ExportImage(image, path); + UnloadImage(image); + record_append_audio_frame(env); + client->record_frame += 1; + if (client->record_frame >= client->record_max_frames) { + record_finalize(client); + } +} + +#endif diff --git a/ocean/bat/binding.c b/ocean/bat/binding.c new file mode 100644 index 0000000000..02ecf722a7 --- /dev/null +++ b/ocean/bat/binding.c @@ -0,0 +1,58 @@ +#include "bat.h" +#define NUM_ATNS NUM_ACTIONS +#define ACT_SIZES {MOVE_ACTIONS, TURN_ACTIONS, CHIRP_FREQ_BINS, CHIRP_FREQ_BINS, CHIRP_DURATION_BINS, CHIRP_EMIT_ACTIONS} +#define OBS_TENSOR_T FloatTensor + +#define Env Bat +#include "vecenv.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = NUM_AGENTS; + env->ear_separation_scale = dict_get(kwargs, "ear_separation_scale")->value; + env->ear_rear_gain = dict_get(kwargs, "ear_rear_gain")->value; + env->ear_front_gain = dict_get(kwargs, "ear_front_gain")->value; + env->ear_side_gain = dict_get(kwargs, "ear_side_gain")->value; + env->max_speed = dict_get(kwargs, "max_speed")->value; + env->min_speed = dict_get(kwargs, "min_speed")->value; + env->accel = dict_get(kwargs, "accel")->value; + env->turn_rate = dict_get(kwargs, "turn_rate")->value; + env->render_target_fps = dict_get(kwargs, "render_target_fps")->value; + env->record_video = dict_get(kwargs, "record_video")->value; + env->record_video_fps = dict_get(kwargs, "record_video_fps")->value; + env->record_video_seconds = dict_get(kwargs, "record_video_seconds")->value; + env->record_video_audio = dict_get(kwargs, 
"record_video_audio")->value; + env->curriculum_initial_level = dict_get(kwargs, "curriculum_initial_level")->value; + env->curriculum_obstacle_step = dict_get(kwargs, "curriculum_obstacle_step")->value; + env->curriculum_successes_per_level = dict_get(kwargs, "curriculum_successes_per_level")->value; + env->curriculum_start_bug_distance = dict_get(kwargs, "curriculum_start_bug_distance")->value; + env->sound_speed = dict_get(kwargs, "sound_speed")->value; + env->reflector_strength = dict_get(kwargs, "reflector_strength")->value; + env->chirp_cooldown_ticks = dict_get(kwargs, "chirp_cooldown_ticks")->value; + env->chirp_efficiency_reward = dict_get(kwargs, "chirp_efficiency_reward")->value; + env->valid_chirp_reward = dict_get(kwargs, "valid_chirp_reward")->value; + env->early_chirp_penalty = dict_get(kwargs, "early_chirp_penalty")->value; + env->chirp_overlap_penalty = dict_get(kwargs, "chirp_overlap_penalty")->value; + env->bug_echo_reward_scale = dict_get(kwargs, "bug_echo_reward_scale")->value; + env->bug_echo_farther_penalty_scale = dict_get(kwargs, "bug_echo_farther_penalty_scale")->value; + env->bug_wing_sideband_gain = dict_get(kwargs, "bug_wing_sideband_gain")->value; + env->step_cost = dict_get(kwargs, "step_cost")->value; + env->progress_reward_scale = dict_get(kwargs, "progress_reward_scale")->value; + env->collision_penalty = dict_get(kwargs, "collision_penalty")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "base_perf", log->base_perf); + dict_set(out, "collision", log->collision); + dict_set(out, "timeout", log->timeout); + dict_set(out, "curriculum_level", log->curriculum_level); + dict_set(out, "curriculum_difficulty", log->curriculum_difficulty); + dict_set(out, "curriculum_perf", log->curriculum_perf); + dict_set(out, 
"num_obstacles", log->num_obstacles); + dict_set(out, "chirps_emitted", log->chirps_emitted); + dict_set(out, "chirp_perf", log->chirp_perf); +} diff --git a/ocean/bat/tests/run_all.sh b/ocean/bat/tests/run_all.sh new file mode 100644 index 0000000000..f693ea1f27 --- /dev/null +++ b/ocean/bat/tests/run_all.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "$0")/../../.." + +mkdir -p build/bat-tests +cc -std=c99 -O2 -Wall -Wextra -DBAT_HEADLESS \ + -I. -Iocean/bat -Ivendor -Iraylib-5.5_linux_amd64/include \ + ocean/bat/tests/test_bat_core.c \ + -lm \ + -o build/bat-tests/test_bat_core + +build/bat-tests/test_bat_core diff --git a/ocean/bat/tests/test_bat_core.c b/ocean/bat/tests/test_bat_core.c new file mode 100644 index 0000000000..40c04b5ca3 --- /dev/null +++ b/ocean/bat/tests/test_bat_core.c @@ -0,0 +1,1836 @@ +#include +#include +#include + +#include "../bat.h" + +#define ASSERT_TRUE(cond) do { \ + if (!(cond)) { \ + printf("ASSERT_TRUE failed at %s:%d: %s\n", __FILE__, __LINE__, #cond); \ + return 1; \ + } \ +} while (0) + +#define ASSERT_FLOAT_NEAR(actual, expected, eps) do { \ + float _a = (actual); \ + float _e = (expected); \ + if (fabsf(_a - _e) > (eps)) { \ + printf("ASSERT_FLOAT_NEAR failed at %s:%d: got %.6f expected %.6f\n", \ + __FILE__, __LINE__, _a, _e); \ + return 1; \ + } \ +} while (0) + +static Bat make_test_env(void) { + Bat env = { + .num_agents = 1, + .num_obstacles = 1, + .ear_separation_scale = 0.75f, + .ear_rear_gain = 0.20f, + .ear_front_gain = 0.55f, + .ear_side_gain = 0.35f, + .max_speed = 12.0f, + .min_speed = 2.4f, + .accel = 30.0f, + .turn_rate = 3.1415926f, + .sound_speed = 100.0f, + .reflector_strength = 2.0f, + .chirp_cooldown_ticks = 12, + .chirp_efficiency_reward = 1.0f, + .step_cost = 0.001f, + .progress_reward_scale = 0.05f, + .collision_penalty = 1.0f, + .valid_chirp_reward = 0.0005f, + .early_chirp_penalty = 0.001f, + .bug_echo_farther_penalty_scale = 0.10f, + .bug_wing_sideband_gain = 0.10f, + 
.curriculum_obstacle_step = 8, + .curriculum_successes_per_level = 1, + .curriculum_start_bug_distance = 14.0f, + .rng = 1, + }; + allocate(&env); + return env; +} + +static int test_chirp_metadata_and_observation_size(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.actions[0] = 0.0f; + env.actions[1] = 0.0f; + env.actions[2] = 7.0f; + env.actions[3] = 0.0f; + env.actions[4] = 3.0f; + env.actions[5] = 1.0f; + c_step(&env); + + ASSERT_FLOAT_NEAR(env.observations[CHIRP_START_OBS], 1.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.observations[CHIRP_END_OBS], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.observations[CHIRP_DURATION_OBS], 1.0f, 0.0001f); + ASSERT_TRUE(env.observations[CHIRP_AGE_OBS] <= 1.0f); + ASSERT_TRUE(env.observations[CHIRP_AGE_OBS] >= 0.0f); + + free_allocated(&env); + return 0; +} + +static int test_chirps_used_observation_tracks_emitted_chirps(void) { + Bat env = make_test_env(); + c_reset(&env); + + ASSERT_FLOAT_NEAR(env.observations[CHIRPS_USED_OBS], 0.0f, 0.0001f); + + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 1.0f; + c_step(&env); + + ASSERT_TRUE(env.chirps_emitted == 1); + ASSERT_FLOAT_NEAR(env.observations[CHIRPS_USED_OBS], + 1.0f / (float)MAX_CHIRPS_PER_EPISODE, 0.0001f); + + env.chirps_emitted = MAX_CHIRPS_PER_EPISODE + 1; + compute_observations(&env); + ASSERT_FLOAT_NEAR(env.observations[CHIRPS_USED_OBS], 1.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_max_chirps_stays_fixed_with_curriculum_level(void) { + Bat env = make_test_env(); + env.curriculum_initial_level = 8; + c_reset(&env); + + ASSERT_TRUE(env.curriculum_level == 8); + env.chirps_emitted = 1; + compute_observations(&env); + ASSERT_FLOAT_NEAR(env.observations[CHIRPS_USED_OBS], + 1.0f / (float)MAX_CHIRPS_PER_EPISODE, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_chirping_after_budget_terminates_with_penalty(void) { + Bat env = make_test_env(); + env.chirp_cooldown_ticks = 5; + 
env.early_chirp_penalty = 0.0f; + c_reset(&env); + env.chirps_emitted = MAX_CHIRPS_PER_EPISODE - 1; + compute_observations(&env); + + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 1.0f; + c_step(&env); + ASSERT_TRUE(env.terminals[0] == 0.0f); + ASSERT_TRUE(env.chirps_emitted == MAX_CHIRPS_PER_EPISODE); + ASSERT_FLOAT_NEAR(env.observations[CHIRPS_USED_OBS], 1.0f, 0.0001f); + + c_step(&env); + + ASSERT_TRUE(env.terminals[0] == 1.0f); + ASSERT_FLOAT_NEAR(env.rewards[0], -1.0f, 0.0001f); + ASSERT_TRUE(env.chirps_emitted == 0); + + free_allocated(&env); + return 0; +} + +static int test_timer_observation_tracks_elapsed_fraction(void) { + Bat env = make_test_env(); + c_reset(&env); + + ASSERT_TRUE(OBS_SIZE == 41); + ASSERT_FLOAT_NEAR(env.observations[TIMER_OBS], 0.0f, 0.0001f); + + env.actions[0] = NOOP; + env.actions[1] = TURN_NONE; + env.actions[5] = 0.0f; + c_step(&env); + + ASSERT_FLOAT_NEAR(env.observations[TIMER_OBS], 1.0f / (float)MAX_STEPS, 0.0001f); + + env.tick = MAX_STEPS / 2; + compute_observations(&env); + ASSERT_FLOAT_NEAR(env.observations[TIMER_OBS], 0.5f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_timeout_terminates_with_minus_one_reward(void) { + Bat env = make_test_env(); + env.num_obstacles = 0; + env.progress_reward_scale = 0.0f; + env.step_cost = 0.0f; + c_reset(&env); + env.tick = MAX_STEPS - 1; + + env.actions[0] = NOOP; + env.actions[1] = TURN_NONE; + env.actions[5] = 0.0f; + c_step(&env); + + ASSERT_TRUE(env.terminals[0] == 1.0f); + ASSERT_FLOAT_NEAR(env.rewards[0], -1.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.timeout, 1.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_chirp_efficiency_scores_low_usage_above_full_budget(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.chirps_emitted = 1; + ASSERT_FLOAT_NEAR(chirp_efficiency(&env), 0.9666667f, 0.0001f); + + env.chirps_emitted = MAX_CHIRPS_PER_EPISODE; + 
ASSERT_FLOAT_NEAR(chirp_efficiency(&env), 0.50f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_chirp_perf_uses_fixed_fifteen_chirp_reference(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.chirps_emitted = 0; + ASSERT_FLOAT_NEAR(chirp_perf(&env), 1.0f, 0.0001f); + + env.chirps_emitted = 6; + ASSERT_FLOAT_NEAR(chirp_perf(&env), 0.60f, 0.0001f); + + env.chirps_emitted = 8; + ASSERT_FLOAT_NEAR(chirp_perf(&env), 0.4666667f, 0.0001f); + + env.chirps_emitted = 15; + ASSERT_FLOAT_NEAR(chirp_perf(&env), 0.05f, 0.0001f); + + env.chirps_emitted = 30; + ASSERT_FLOAT_NEAR(chirp_perf(&env), 0.05f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_success_reward_includes_chirp_efficiency_bonus(void) { + Bat env = make_test_env(); + env.chirp_efficiency_reward = 1.0f; + c_reset(&env); + + env.chirps_emitted = 2; + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 20.5f; + env.bug_y = 20.0f; + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 1.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.rewards[0], 0.9333333f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_curriculum_perf_uses_distance_and_obstacle_difficulty(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.curriculum_start_bug_distance = 8.0f; + env.num_obstacles = 2; + env.start_bug_dist = 32.0f; + + ASSERT_FLOAT_NEAR(curriculum_distance_difficulty(&env), 0.5000000f, 0.0001f); + ASSERT_FLOAT_NEAR(curriculum_obstacle_difficulty(&env), 0.6666667f, 0.0001f); + ASSERT_FLOAT_NEAR(curriculum_motion_difficulty(&env), 0.0000000f, 0.0001f); + ASSERT_FLOAT_NEAR(curriculum_difficulty(&env), 0.3888889f, 0.0001f); + add_log(&env, 1.0f, 0.0f, 0.0f); + ASSERT_FLOAT_NEAR(env.log.base_perf, 1.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.curriculum_difficulty, 0.3888889f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.curriculum_perf, 0.3888889f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.num_obstacles, 2.0f, 0.0001f); + + memset(&env.log, 0, sizeof(env.log)); + 
add_log(&env, 0.0f, 1.0f, 0.0f); + ASSERT_FLOAT_NEAR(env.log.base_perf, 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.curriculum_difficulty, 0.3888889f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.curriculum_perf, 0.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_perf_composes_base_perf_curriculum_difficulty_and_chirp_perf(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.curriculum_start_bug_distance = 8.0f; + env.num_obstacles = 2; + env.chirps_emitted = 7; + env.start_bug_dist = 32.0f; + + add_log(&env, 1.0f, 0.0f, 0.0f); + + ASSERT_FLOAT_NEAR(env.log.base_perf, 1.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.chirp_perf, 0.5333334f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.curriculum_difficulty, 0.3888889f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.perf, 0.2074074f, 0.0001f); + + memset(&env.log, 0, sizeof(env.log)); + add_log(&env, 0.0f, 1.0f, 0.0f); + ASSERT_FLOAT_NEAR(env.log.base_perf, 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.log.perf, 0.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int test_left_right_echo_asymmetry(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.x = 20.0f; + env.y = 20.0f; + env.heading = 0.0f; + env.bug_x = 35.0f; + env.bug_y = 10.0f; + env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + clear_echo_queue(&env); + env.tick = 0; + + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 1.0f, + .end_freq = 1.0f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = 0, + .active = 1, + }; + schedule_echo(&env, &chirp, 0.0f, 1.0f, + env.bug_x, env.bug_y, env.bug_vx, env.bug_vy, 8.0f, ECHO_BUG); + + float left_energy = 0.0f; + float right_energy = 0.0f; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + if (env.echo_queue[i].tick < 0) continue; + for (int bin = 0; bin < FREQ_BINS; bin++) { + left_energy += env.echo_queue[i].energy[0][bin]; + right_energy += env.echo_queue[i].energy[1][bin]; + } + } + + ASSERT_TRUE(left_energy > right_energy); + + free_allocated(&env); + return 0; +} + 
+typedef struct BatEchoProbe { + float left_energy; + float right_energy; + float left_tick; + float right_tick; +} BatEchoProbe; + +static BatEchoProbe test_probe_echo_from_relative_source(float dx, float dy) { + Bat env = make_test_env(); + c_reset(&env); + + env.x = 24.0f; + env.y = 24.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.heading = 0.0f; + env.sound_speed = 40.0f; + env.ear_separation_scale = 2.0f; + env.ear_rear_gain = 0.20f; + env.ear_front_gain = 0.55f; + env.ear_side_gain = 0.35f; + env.tick = 0; + clear_echo_queue(&env); + + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 0.5f, + .end_freq = 0.5f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = 0, + .active = 1, + }; + schedule_echo(&env, &chirp, 0.0f, 0.5f, + env.x + dx, env.y + dy, 0.0f, 0.0f, 8.0f, ECHO_BUG); + + BatEchoProbe probe = { + .left_tick = -1.0f, + .right_tick = -1.0f, + }; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + if (env.echo_queue[i].tick < 0) continue; + float left_energy = 0.0f; + float right_energy = 0.0f; + for (int bin = 0; bin < FREQ_BINS; bin++) { + left_energy += env.echo_queue[i].energy[0][bin]; + right_energy += env.echo_queue[i].energy[1][bin]; + } + if (left_energy > 0.0f) { + probe.left_energy += left_energy; + probe.left_tick = env.echo_queue[i].tick; + } + if (right_energy > 0.0f) { + probe.right_energy += right_energy; + probe.right_tick = env.echo_queue[i].tick; + } + } + + free_allocated(&env); + return probe; +} + +static int test_directional_echo_arrival_and_gain_by_side(void) { + const float left_sources[3][2] = { + {0.0f, -18.0f}, + {18.0f, -18.0f}, + {24.0f, -8.0f}, + }; + const float right_sources[3][2] = { + {0.0f, 18.0f}, + {18.0f, 18.0f}, + {24.0f, 8.0f}, + }; + + for (int i = 0; i < 3; i++) { + BatEchoProbe left = test_probe_echo_from_relative_source( + left_sources[i][0], left_sources[i][1]); + ASSERT_TRUE(left.left_tick > 0.0f); + ASSERT_TRUE(left.right_tick > 0.0f); + ASSERT_TRUE(left.left_tick < left.right_tick); + 
ASSERT_TRUE(left.left_energy > left.right_energy); + + BatEchoProbe right = test_probe_echo_from_relative_source( + right_sources[i][0], right_sources[i][1]); + ASSERT_TRUE(right.left_tick > 0.0f); + ASSERT_TRUE(right.right_tick > 0.0f); + ASSERT_TRUE(right.right_tick < right.left_tick); + ASSERT_TRUE(right.right_energy > right.left_energy); + } + + BatEchoProbe front = test_probe_echo_from_relative_source(18.0f, 0.0f); + ASSERT_TRUE(front.left_tick > 0.0f); + ASSERT_TRUE(front.right_tick > 0.0f); + ASSERT_FLOAT_NEAR(front.left_tick, front.right_tick, 0.0001f); + ASSERT_FLOAT_NEAR(front.left_energy, front.right_energy, 0.0001f); + + return 0; +} + +static int test_ear_directivity_gains_control_echo_energy(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.x = 20.0f; + env.y = 20.0f; + env.heading = 0.0f; + env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + env.ear_rear_gain = 0.0f; + env.ear_front_gain = 1.0f; + env.ear_side_gain = 0.0f; + env.tick = 0; + + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 1.0f, + .end_freq = 1.0f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = 0, + .active = 1, + }; + + clear_echo_queue(&env); + schedule_echo(&env, &chirp, 0.0f, 1.0f, + env.x + 16.0f, env.y, 0.0f, 0.0f, 8.0f, ECHO_BUG); + float front_energy = 0.0f; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + for (int ear = 0; ear < 2; ear++) { + for (int bin = 0; bin < FREQ_BINS; bin++) { + front_energy += env.echo_queue[i].energy[ear][bin]; + } + } + } + + clear_echo_queue(&env); + schedule_echo(&env, &chirp, 0.0f, 1.0f, + env.x, env.y - 16.0f, 0.0f, 0.0f, 8.0f, ECHO_BUG); + float side_energy = 0.0f; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + for (int ear = 0; ear < 2; ear++) { + for (int bin = 0; bin < FREQ_BINS; bin++) { + side_energy += env.echo_queue[i].energy[ear][bin]; + } + } + } + + ASSERT_TRUE(front_energy > 0.0f); + ASSERT_FLOAT_NEAR(side_energy, 0.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + +static int 
test_default_sound_speed_allows_one_tick_interaural_delay(void) { + Bat env = { + .num_agents = 1, + .num_obstacles = 0, + .ear_separation_scale = 0.75f, + .ear_rear_gain = 0.20f, + .ear_front_gain = 0.55f, + .ear_side_gain = 0.35f, + .max_speed = 12.0f, + .accel = 30.0f, + .turn_rate = 3.1415926f, + .sound_speed = 60.0f, + .rng = 1, + }; + allocate(&env); + + env.x = 20.0f; + env.y = 20.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.heading = 0.0f; + env.tick = 0; + clear_echo_queue(&env); + + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 0.5f, + .end_freq = 0.5f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = 0, + .active = 1, + }; + schedule_echo(&env, &chirp, 0.0f, 0.5f, + env.x, env.y - 12.0f, 0.0f, 0.0f, 8.0f, ECHO_BUG); + + float left_tick = -1.0f; + float right_tick = -1.0f; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + if (env.echo_queue[i].tick < 0) continue; + float left_energy = 0.0f; + float right_energy = 0.0f; + for (int bin = 0; bin < FREQ_BINS; bin++) { + left_energy += env.echo_queue[i].energy[0][bin]; + right_energy += env.echo_queue[i].energy[1][bin]; + } + if (left_energy > 0.0f) left_tick = env.echo_queue[i].tick; + if (right_energy > 0.0f) right_tick = env.echo_queue[i].tick; + } + + ASSERT_TRUE(left_tick > 0.0f); + ASSERT_TRUE(right_tick > 0.0f); + ASSERT_TRUE(fabsf(left_tick - right_tick) >= 1.0f); + + free_allocated(&env); + return 0; +} + +static int test_echo_scheduling_uses_tick_bucket_accumulator(void) { + Bat env = make_test_env(); + c_reset(&env); + + clear_echo_queue(&env); + env.tick = 7; + add_echo_event(&env, 0, 9.25f, 1.0f, 0.4f, 18.0f, ECHO_BUG); + add_echo_event(&env, 0, 9.75f, 1.0f, 0.7f, 12.0f, ECHO_BUG); + + int slot = 10 % ECHO_QUEUE_TICKS; + ASSERT_TRUE(env.echo_queue[slot].tick == 10); + ASSERT_FLOAT_NEAR(env.echo_queue[slot].energy[0][FREQ_BINS - 1], 1.1f, 0.0001f); + ASSERT_FLOAT_NEAR(env.echo_queue[slot].closest_bug_echo_path, 12.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + 
/* With bug_wing_sideband_gain = 0.25, a 0.4-energy ECHO_BUG event must also put 0.1 in each neighbouring bin; the same event as ECHO_STATIC must leave the neighbours at 0 and closest_bug_echo_path at -1. */ +static int test_bug_wing_sidebands_spill_adjacent_bins_without_reward_inflation(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.tick = 0; + env.bug_wing_sideband_gain = 0.25f; + clear_echo_queue(&env); + + int bin = (int)(0.5f * FREQ_BINS); + add_echo_event(&env, 0, 1.0f, 0.5f, 0.4f, 12.0f, ECHO_BUG); + EchoBucket* bug_bucket = &env.echo_queue[1 % ECHO_QUEUE_TICKS]; + ASSERT_FLOAT_NEAR(bug_bucket->energy[0][bin], 0.4f, 0.0001f); + ASSERT_FLOAT_NEAR(bug_bucket->energy[0][bin - 1], 0.1f, 0.0001f); + ASSERT_FLOAT_NEAR(bug_bucket->energy[0][bin + 1], 0.1f, 0.0001f); + ASSERT_FLOAT_NEAR(bug_bucket->closest_bug_echo_path, 12.0f, 0.0001f); + + clear_echo_queue(&env); + add_echo_event(&env, 0, 1.0f, 0.5f, 0.4f, 12.0f, ECHO_STATIC); + EchoBucket* static_bucket = &env.echo_queue[1 % ECHO_QUEUE_TICKS]; + ASSERT_FLOAT_NEAR(static_bucket->energy[0][bin], 0.4f, 0.0001f); + ASSERT_FLOAT_NEAR(static_bucket->energy[0][bin - 1], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(static_bucket->energy[0][bin + 1], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(static_bucket->closest_bug_echo_path, -1.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* Helper: schedules one ECHO_BUG echo from a point 12 units away in -y using the given ear_separation_scale and returns |left_tick - right_tick| over the queued buckets. NOTE(review): this returns float yet uses ASSERT_TRUE; confirm the macro's failure value cannot be mistaken for a gap. */ +static float test_side_echo_receive_tick_gap(float ear_separation_scale) { + Bat env = make_test_env(); + c_reset(&env); + + env.ear_separation_scale = ear_separation_scale; + env.x = 20.0f; + env.y = 20.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.heading = 0.0f; + env.tick = 0; + clear_echo_queue(&env); + + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 0.5f, + .end_freq = 0.5f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = 0, + .active = 1, + }; + schedule_echo(&env, &chirp, 0.0f, 0.5f, + env.x, env.y - 12.0f, 0.0f, 0.0f, 8.0f, ECHO_BUG); + + float left_tick = -1.0f; + float right_tick = -1.0f; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + if (env.echo_queue[i].tick < 0) continue; + float left_energy = 0.0f; + float right_energy = 0.0f; + for (int bin = 0; bin < FREQ_BINS; bin++) { + left_energy +=
env.echo_queue[i].energy[0][bin]; + right_energy += env.echo_queue[i].energy[1][bin]; + } + if (left_energy > 0.0f) left_tick = env.echo_queue[i].tick; + if (right_energy > 0.0f) right_tick = env.echo_queue[i].tick; + } + + ASSERT_TRUE(left_tick > 0.0f); + ASSERT_TRUE(right_tick > 0.0f); + float gap = fabsf(left_tick - right_tick); + + free_allocated(&env); + return gap; +} + /* Doubling ear_separation_scale (0.75 -> 1.50) must widen the interaural tick gap by more than 1.75x. */ +static int test_ear_separation_scale_controls_arrival_gap(void) { + float narrow_gap = test_side_echo_receive_tick_gap(0.75f); + float wide_gap = test_side_echo_receive_tick_gap(1.50f); + + ASSERT_TRUE(narrow_gap > 0.0f); + ASSERT_TRUE(wide_gap > narrow_gap * 1.75f); + + return 0; +} + /* A bug moving toward the bat (bug_vx = -16, bug at larger x) must leave more observed energy in the upper half of the frequency bins than in the lower half at tick 27. */ +static int test_doppler_sign_for_approaching_bug(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.x = 20.0f; + env.y = 20.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.bug_x = 42.0f; + env.bug_y = 20.0f; + env.bug_vx = -16.0f; + env.bug_vy = 0.0f; + env.heading = 0.0f; + memset(env.observations, 0, OBS_SIZE * sizeof(float)); + clear_echo_queue(&env); + env.tick = 0; + + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 0.5f, + .end_freq = 0.5f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = 0, + .active = 1, + }; + schedule_echo(&env, &chirp, 0.0f, 0.5f, + env.bug_x, env.bug_y, env.bug_vx, env.bug_vy, 8.0f, ECHO_BUG); + + env.tick = 27; + compute_observations(&env); + + float low_energy = 0.0f; + float high_energy = 0.0f; + for (int i = 0; i < FREQ_BINS; i++) { + float energy = env.observations[LEFT_FREQ_OFFSET + i] + + env.observations[RIGHT_FREQ_OFFSET + i]; + if (i < FREQ_BINS / 2) { + low_energy += energy; + } else { + high_energy += energy; + } + } + + ASSERT_TRUE(high_energy > low_energy); + + free_allocated(&env); + return 0; +} + /* Starting at x = ARENA_WIDTH - AGENT_RADIUS - 0.1 with vx = max_speed, one step must set terminals[0] to 1 and rewards[0] to -1. */ +static int test_wall_collision_is_terminal_minus_one(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.x = ARENA_WIDTH - AGENT_RADIUS - 0.1f; + env.y = ARENA_HEIGHT * 0.5f; + env.heading = 0.0f; + env.vx = env.max_speed; +
env.vy = 0.0f; + env.actions[0] = 1.0f; + env.actions[1] = 0.0f; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 0.0f; + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 1.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.rewards[0], -1.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* With the bug 0.5 units from the bat, one step must terminate the episode with a reward above 0.9. */ +static int test_catch_bug_is_terminal_plus_one(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 20.5f; + env.bug_y = 20.0f; + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 1.0f, 0.0001f); + ASSERT_TRUE(env.rewards[0] > 0.9f); + + free_allocated(&env); + return 0; +} + /* prev_bug_dist is set to 25 while the bug sits 20 units away, so the step reward must come out positive. */ +static int test_progress_reward_sign(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 40.0f; + env.bug_y = 20.0f; + env.prev_bug_dist = 25.0f; + env.vx = 0.0f; + env.vy = 0.0f; + + env.actions[0] = 1.0f; + env.actions[1] = 0.0f; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 0.0f; + c_step(&env); + + ASSERT_TRUE(env.rewards[0] > 0.0f); + + free_allocated(&env); + return 0; +} + /* BRAKE from zero velocity must not produce negative forward speed, either in the velocity vector or in FORWARD_SPEED_OBS. */ +static int test_bat_cannot_accelerate_backward_from_brake(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + env.heading = 0.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.actions[0] = BRAKE; + env.actions[1] = TURN_NONE; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 0.0f; + + c_step(&env); + + float forward = env.vx * cosf(env.heading) + env.vy * sinf(env.heading); + ASSERT_TRUE(forward >= -0.0001f); + ASSERT_TRUE(env.observations[FORWARD_SPEED_OBS] >= -0.0001f); + + free_allocated(&env); + return 0; +} + /* After c_reset the forward speed must be at least 0.19 * max_speed and FORWARD_SPEED_OBS must equal forward / max_speed. */ +static int test_bat_reset_starts_with_forward_stall_speed(void) { + Bat env = make_test_env(); + c_reset(&env); + + float forward = env.vx *
cosf(env.heading) + env.vy * sinf(env.heading); + ASSERT_TRUE(forward >= 0.19f * env.max_speed); + ASSERT_FLOAT_NEAR(env.observations[FORWARD_SPEED_OBS], forward / env.max_speed, 0.0001f); + + free_allocated(&env); + return 0; +} + /* BRAKE from rest must still leave forward speed >= 0.19 * max_speed and move the bat in +x. */ +static int test_bat_brake_clamps_to_forward_stall_speed(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + env.heading = 0.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.actions[0] = BRAKE; + env.actions[1] = TURN_NONE; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 0.0f; + + c_step(&env); + + float forward = env.vx * cosf(env.heading) + env.vy * sinf(env.heading); + ASSERT_TRUE(forward >= 0.19f * env.max_speed); + ASSERT_TRUE(env.x > 20.0f); + + free_allocated(&env); + return 0; +} + /* A starting velocity with backward and lateral components must, after one NOOP step, have no lateral component and a non-negative forward speed. */ +static int test_bat_velocity_is_locked_to_heading(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + env.heading = 0.0f; + env.vx = -env.max_speed * 0.5f; + env.vy = 3.0f; + env.actions[0] = NOOP; + env.actions[1] = TURN_NONE; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 0.0f; + + c_step(&env); + + float forward = env.vx * cosf(env.heading) + env.vy * sinf(env.heading); + float lateral = env.vx * -sinf(env.heading) + env.vy * cosf(env.heading); + ASSERT_TRUE(forward >= -0.0001f); + ASSERT_FLOAT_NEAR(lateral, 0.0f, 0.0001f); + ASSERT_TRUE(env.observations[FORWARD_SPEED_OBS] >= -0.0001f); + + free_allocated(&env); + return 0; +} + /* From rest with TURN_LEFT, one step must restore forward speed >= 0.19 * max_speed, move the bat, and change the heading. */ +static int test_bat_zero_speed_recovers_to_forward_arc(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; +
env.heading = 0.25f; + env.vx = 0.0f; + env.vy = 0.0f; + env.actions[0] = NOOP; + env.actions[1] = TURN_LEFT; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 0.0f; + + float start_x = env.x; + float start_y = env.y; + c_step(&env); + + float forward = env.vx * cosf(env.heading) + env.vy * sinf(env.heading); + ASSERT_TRUE(forward >= 0.19f * env.max_speed); + ASSERT_TRUE(dist(start_x, start_y, env.x, env.y) > 0.0f); + ASSERT_TRUE(fabsf(env.heading - 0.25f) > 0.0001f); + + free_allocated(&env); + return 0; +} + /* At half of max speed with TURN_RIGHT, turn_velocity must equal turn_rate * 0.5 and the heading must advance by that rate times TICK_RATE. */ +static int test_bat_turn_rate_scales_with_forward_speed(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + env.heading = 0.0f; + env.vx = env.max_speed * 0.5f; + env.vy = 0.0f; + env.actions[0] = NOOP; + env.actions[1] = TURN_RIGHT; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 0.0f; + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.turn_velocity, env.turn_rate * 0.5f, 0.0001f); + ASSERT_FLOAT_NEAR(env.heading, env.turn_rate * 0.5f * TICK_RATE, 0.0001f); + + free_allocated(&env); + return 0; +} + /* Pins the move-action constants: exactly three actions, NOOP = 0, THRUST_FORWARD = 1, BRAKE = 2. */ +static int test_bat_speed_action_space_has_no_strafe(void) { + ASSERT_TRUE(MOVE_ACTIONS == 3); + ASSERT_TRUE(NOOP == 0); + ASSERT_TRUE(THRUST_FORWARD == 1); + ASSERT_TRUE(BRAKE == 2); + return 0; +} + /* Normalized frequencies 0 and 1 must map to 600 Hz and 3600 Hz; samples at index -1 and 9600 must be 0, and the sample at index 2400 must lie within [-0.25, 0.25]. */ +static int test_chirp_audio_maps_norm_freq_to_audible_sweep(void) { + ASSERT_FLOAT_NEAR(chirp_audio_frequency_hz(0.0f), 600.0f, 0.0001f); + ASSERT_FLOAT_NEAR(chirp_audio_frequency_hz(1.0f), 3600.0f, 0.0001f); + ASSERT_FLOAT_NEAR(chirp_audio_sample_f32(0.0f, 1.0f, 0.20f, -1, 48000), 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(chirp_audio_sample_f32(0.0f, 1.0f, 0.20f, 9600, 48000), 0.0f, 0.0001f); + float sample = chirp_audio_sample_f32(0.0f, 1.0f, 0.20f, 2400, 48000); + ASSERT_TRUE(sample >= -0.25f); + ASSERT_TRUE(sample <= 0.25f); + return 0; +} +
/* The audio chirp duration must equal the base duration at 60 fps and be 2x / 4x longer at 30 / 15 fps. */ +static int test_chirp_audio_duration_scales_with_render_fps(void) { + Bat env = make_test_env(); + float base_duration = chirp_duration_seconds(0.0f); + env.render_target_fps = 60; + ASSERT_FLOAT_NEAR(chirp_audio_duration_seconds(&env, 0.0f), base_duration, 0.0001f); + env.render_target_fps = 30; + ASSERT_FLOAT_NEAR(chirp_audio_duration_seconds(&env, 0.0f), base_duration * 2.0f, 0.0001f); + env.render_target_fps = 15; + ASSERT_FLOAT_NEAR(chirp_audio_duration_seconds(&env, 0.0f), base_duration * 4.0f, 0.0001f); + free_allocated(&env); + return 0; +} + /* With chirp_cooldown_ticks = 12, a second try_emit_chirp at the same tick must fail and one issued 12 ticks later must succeed. */ +static int test_chirp_cooldown_accepts_only_after_delay(void) { + Bat env = make_test_env(); + c_reset(&env); + env.chirp_cooldown_ticks = 12; + + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 1.0f; + ASSERT_TRUE(try_emit_chirp(&env)); + ASSERT_TRUE(!try_emit_chirp(&env)); + + env.tick += 12; + ASSERT_TRUE(try_emit_chirp(&env)); + + free_allocated(&env); + return 0; +} + /* Helper: removes obstacles, parks the bat at (20, 20) and the bug at (48, 48) with zero velocities, and refreshes prev_bug_dist to match. */ +static void test_place_safe_stationary_scene(Bat* env) { + env->num_obstacles = 0; + env->x = 20.0f; + env->y = 20.0f; + env->vx = 0.0f; + env->vy = 0.0f; + env->heading = 0.0f; + env->bug_x = 48.0f; + env->bug_y = 48.0f; + env->bug_vx = 0.0f; + env->bug_vy = 0.0f; + env->prev_bug_dist = dist(env->x, env->y, env->bug_x, env->bug_y); +} + /* Helper: fills actions[0..5] with NOOP, TURN_NONE, 0, 7, 1, 1; actions[5] = 1 presumably requests a chirp (TODO confirm against the action decoder). */ +static void test_set_emit_chirp_action(Bat* env) { + env->actions[0] = NOOP; + env->actions[1] = TURN_NONE; + env->actions[2] = 0.0f; + env->actions[3] = 7.0f; + env->actions[4] = 1.0f; + env->actions[5] = 1.0f; +} + /* With the other reward terms zeroed, the first chirp must earn exactly valid_chirp_reward and count as one emitted chirp. */ +static int test_valid_chirp_gets_reward(void) { + Bat env = make_test_env(); + c_reset(&env); + test_place_safe_stationary_scene(&env); + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.bug_echo_reward_scale = 0.0f; + env.valid_chirp_reward = 0.0005f; + env.early_chirp_penalty = 0.0020f; + test_set_emit_chirp_action(&env); + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.rewards[0],
env.valid_chirp_reward, 0.0001f); + ASSERT_TRUE(env.chirps_emitted == 1); + + free_allocated(&env); + return 0; +} + /* A second chirp requested on the step right after the first, under a 12-tick cooldown, must cost early_chirp_penalty and leave chirps_emitted at 1. */ +static int test_early_chirp_gets_penalty_and_emits_nothing(void) { + Bat env = make_test_env(); + c_reset(&env); + test_place_safe_stationary_scene(&env); + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.bug_echo_reward_scale = 0.0f; + env.valid_chirp_reward = 0.0005f; + env.early_chirp_penalty = 0.0020f; + env.chirp_cooldown_ticks = 12; + test_set_emit_chirp_action(&env); + c_step(&env); + test_place_safe_stationary_scene(&env); + test_set_emit_chirp_action(&env); + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.rewards[0], -env.early_chirp_penalty, 0.0001f); + ASSERT_TRUE(env.chirps_emitted == 1); + + free_allocated(&env); + return 0; +} + /* Chirping at tick 5 when the last chirp was at tick 0 and its bug echo is expected at tick 10 must subtract 0.5 * chirp_overlap_penalty from valid_chirp_reward. */ +static int test_chirp_before_bug_echo_arrives_gets_scaled_overlap_penalty(void) { + Bat env = make_test_env(); + c_reset(&env); + test_place_safe_stationary_scene(&env); + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.bug_echo_reward_scale = 0.0f; + env.valid_chirp_reward = 0.0005f; + env.early_chirp_penalty = 0.0020f; + env.chirp_overlap_penalty = 0.0040f; + env.chirp_cooldown_ticks = 1; + test_set_emit_chirp_action(&env); + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.rewards[0], env.valid_chirp_reward, 0.0001f); + ASSERT_TRUE(env.chirps_emitted == 1); + + env.last_chirp_tick = 0; + env.last_bug_echo_expected_tick = 10.0f; + env.tick = 5; + test_place_safe_stationary_scene(&env); + test_set_emit_chirp_action(&env); + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.rewards[0], + env.valid_chirp_reward - 0.5f * env.chirp_overlap_penalty, 0.0001f); + ASSERT_TRUE(env.chirps_emitted == 2); + + free_allocated(&env); + return 0; +} + /* Chirping at tick 4, after a bug echo that was expected at tick 3, must earn the full valid_chirp_reward with no overlap penalty. */ +static int test_chirp_after_bug_echo_arrives_ignores_static_echo_window(void) { + Bat env =
make_test_env(); + c_reset(&env); + test_place_safe_stationary_scene(&env); + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.bug_echo_reward_scale = 0.0f; + env.valid_chirp_reward = 0.0005f; + env.chirp_overlap_penalty = 0.0040f; + env.chirp_cooldown_ticks = 1; + env.chirps_emitted = 1; + env.last_chirp_tick = 0; + env.last_bug_echo_expected_tick = 3.0f; + env.tick = 4; + test_set_emit_chirp_action(&env); + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.terminals[0], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.rewards[0], env.valid_chirp_reward, 0.0001f); + ASSERT_TRUE(env.chirps_emitted == 2); + + free_allocated(&env); + return 0; +} + /* Checks that 2 * distance / sound_speed is 0.5 s for 25 units at speed 100 and that a 0.02 s tolerance accepts a 5 ms offset but rejects 50 ms. NOTE(review): this is local arithmetic only and never calls into the environment. */ +static int test_reflection_arrives_at_two_way_travel_time(void) { + float sound_speed = 100.0f; + float distance = 25.0f; + float echo_time = 2.0f * distance / sound_speed; + + ASSERT_FLOAT_NEAR(echo_time, 0.5f, 0.0001f); + ASSERT_TRUE(fabsf((echo_time + 0.005f) - echo_time) <= 0.02f); + ASSERT_TRUE(fabsf((echo_time + 0.050f) - echo_time) > 0.02f); + + return 0; +} + /* Helper: sums `count` observation entries starting at `offset`. */ +static float test_sum_obs(Bat* env, int offset, int count) { + float sum = 0.0f; + for (int i = 0; i < count; i++) { + sum += env->observations[offset + i]; + } + return sum; +} + /* Pins the observation layout: 41 entries, two 16-bin ear spectra at offsets 0 and 16, then the scalar slots 32 through 40. */ +static int test_bins_only_observation_layout(void) { + ASSERT_TRUE(OBS_SIZE == 41); + ASSERT_TRUE(FREQ_BINS == 16); + ASSERT_TRUE(LEFT_FREQ_OFFSET == 0); + ASSERT_TRUE(RIGHT_FREQ_OFFSET == 16); + ASSERT_TRUE(CHIRP_AGE_OBS == 32); + ASSERT_TRUE(CHIRP_COOLDOWN_OBS == 33); + ASSERT_TRUE(CHIRP_START_OBS == 34); + ASSERT_TRUE(CHIRP_END_OBS == 35); + ASSERT_TRUE(CHIRP_DURATION_OBS == 36); + ASSERT_TRUE(CHIRPS_USED_OBS == 37); + ASSERT_TRUE(FORWARD_SPEED_OBS == 38); + ASSERT_TRUE(TURN_RATE_OBS == 39); + ASSERT_TRUE(TIMER_OBS == 40); + return 0; +} + /* Right after c_reset, with no chirp emitted, both ears' frequency bins must sum to 0. */ +static int test_no_chirp_produces_silent_frequency_bins(void) { + Bat env = make_test_env(); + c_reset(&env); + + ASSERT_FLOAT_NEAR(test_sum_obs(&env, LEFT_FREQ_OFFSET, FREQ_BINS), 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(test_sum_obs(&env,
RIGHT_FREQ_OFFSET, FREQ_BINS), 0.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* CHIRP_AGE_OBS must be 1 after reset and 1 / chirp_age_norm_denominator after one chirping step; every observation must stay within [-1, 1] both times. */ +static int test_observations_stay_normalized_after_chirp(void) { + Bat env = make_test_env(); + c_reset(&env); + + ASSERT_FLOAT_NEAR(env.observations[CHIRP_AGE_OBS], 1.0f, 0.0001f); + for (int i = 0; i < OBS_SIZE; i++) { + ASSERT_TRUE(env.observations[i] >= -1.0f); + ASSERT_TRUE(env.observations[i] <= 1.0f); + } + + env.actions[0] = NOOP; + env.actions[1] = TURN_NONE; + env.actions[2] = 0.0f; + env.actions[3] = 7.0f; + env.actions[4] = 1.0f; + env.actions[5] = 1.0f; + c_step(&env); + + float age_denom = chirp_age_norm_denominator(&env); + ASSERT_FLOAT_NEAR(env.observations[CHIRP_AGE_OBS], 1.0f / age_denom, 0.0001f); + for (int i = 0; i < OBS_SIZE; i++) { + ASSERT_TRUE(env.observations[i] >= -1.0f); + ASSERT_TRUE(env.observations[i] <= 1.0f); + } + + free_allocated(&env); + return 0; +} + /* With no initial level set, reset must force num_obstacles from 3 down to 0 and place the bug within 14 units of the bat. */ +static int test_curriculum_level_zero_starts_close_with_no_obstacles(void) { + Bat env = make_test_env(); + env.num_obstacles = 3; + env.curriculum_obstacle_step = 1; + env.curriculum_start_bug_distance = 12.0f; + c_reset(&env); + + ASSERT_TRUE(env.num_obstacles == 0); + ASSERT_TRUE(dist(env.x, env.y, env.bug_x, env.bug_y) <= 14.0f); + + free_allocated(&env); + return 0; +} + /* With curriculum_obstacle_step = 4, initial levels 1, 5 and 9 must reset to 1, 2 and 3 obstacles respectively. */ +static int test_curriculum_adds_first_obstacle_after_level_zero(void) { + Bat env = make_test_env(); + env.num_obstacles = 3; + env.curriculum_obstacle_step = 4; + + env.curriculum_initial_level = 1; + c_reset(&env); + ASSERT_TRUE(env.num_obstacles == 1); + + env.curriculum_initial_level = 5; + env.curriculum_level = 0; + c_reset(&env); + ASSERT_TRUE(env.num_obstacles == 2); + + env.curriculum_initial_level = 9; + env.curriculum_level = 0; + c_reset(&env); + ASSERT_TRUE(env.num_obstacles == 3); + + free_allocated(&env); + return 0; +} + /* One catch must raise curriculum_level to 1, enable one obstacle, and leave the bat and bug within 16 units of each other. */ +static int test_curriculum_advances_after_catch(void) { + Bat env = make_test_env(); + env.num_obstacles = 3; + env.curriculum_obstacle_step = 1; + env.curriculum_start_bug_distance =
12.0f; + c_reset(&env); + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 20.5f; + env.bug_y = 20.0f; + + c_step(&env); + + ASSERT_TRUE(env.curriculum_level == 1); + ASSERT_TRUE(env.num_obstacles == 1); + ASSERT_TRUE(dist(env.x, env.y, env.bug_x, env.bug_y) <= 16.0f); + + free_allocated(&env); + return 0; +} + /* With curriculum_successes_per_level = 2, the first catch only increments the success counter; the second advances the level and clears the counter. */ +static int test_curriculum_waits_for_required_catches(void) { + Bat env = make_test_env(); + env.num_obstacles = 3; + env.curriculum_obstacle_step = 1; + env.curriculum_start_bug_distance = 12.0f; + env.curriculum_successes_per_level = 2; + c_reset(&env); + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 20.5f; + env.bug_y = 20.0f; + + c_step(&env); + + ASSERT_TRUE(env.curriculum_level == 0); + ASSERT_TRUE(env.curriculum_successes_at_level == 1); + + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 20.5f; + env.bug_y = 20.0f; + + c_step(&env); + + ASSERT_TRUE(env.curriculum_level == 1); + ASSERT_TRUE(env.curriculum_successes_at_level == 0); + + free_allocated(&env); + return 0; +} + /* Initial level 4 with obstacle step 2 and start distance 8 must reset to level 4, 2 obstacles, and a bat-bug distance in [15, 17]. */ +static int test_curriculum_initial_level_sets_first_reset_difficulty(void) { + Bat env = make_test_env(); + env.num_obstacles = 3; + env.curriculum_initial_level = 4; + env.curriculum_obstacle_step = 2; + env.curriculum_start_bug_distance = 8.0f; + c_reset(&env); + + ASSERT_TRUE(env.curriculum_level == 4); + ASSERT_TRUE(env.num_obstacles == 2); + float distance = dist(env.x, env.y, env.bug_x, env.bug_y); + ASSERT_TRUE(distance >= 15.0f); + ASSERT_TRUE(distance <= 17.0f); + + free_allocated(&env); + return 0; +} + /* Starting from initial level 2 with one success per level, a single catch must advance to level 3 with the success counter cleared. */ +static int test_curriculum_initial_level_does_not_reset_progress(void) { + Bat env = make_test_env(); + env.num_obstacles = 3; + env.curriculum_initial_level = 2; + env.curriculum_obstacle_step = 1; + env.curriculum_successes_per_level = 1; + env.curriculum_start_bug_distance = 8.0f; + c_reset(&env); + env.x = 20.0f; + env.y = 20.0f; + env.bug_x = 20.5f; + env.bug_y = 20.0f; + + c_step(&env); + + ASSERT_TRUE(env.curriculum_level == 3); +
ASSERT_TRUE(env.curriculum_successes_at_level == 0); + + free_allocated(&env); + return 0; +} + /* A bug placed 0.1 past the high-x or low-y limit must be clamped back to the limit, with only the wall-normal velocity component changing sign. */ +static int test_bug_bounces_off_arena_walls(void) { + Bat env = make_test_env(); + c_reset(&env); + + env.bug_x = ARENA_WIDTH - BUG_RADIUS + 0.1f; + env.bug_y = ARENA_HEIGHT * 0.5f; + env.bug_vx = 3.0f; + env.bug_vy = 1.0f; + update_bug(&env, 0.0f); + ASSERT_TRUE(env.bug_x == ARENA_WIDTH - BUG_RADIUS); + ASSERT_TRUE(env.bug_vx < 0.0f); + ASSERT_TRUE(env.bug_vy == 1.0f); + + env.bug_x = ARENA_WIDTH * 0.5f; + env.bug_y = BUG_RADIUS - 0.1f; + env.bug_vx = 2.0f; + env.bug_vy = -4.0f; + update_bug(&env, 0.0f); + ASSERT_TRUE(env.bug_y == BUG_RADIUS); + ASSERT_TRUE(env.bug_vx == 2.0f); + ASSERT_TRUE(env.bug_vy > 0.0f); + + free_allocated(&env); + return 0; +} + /* With the bug 6 units away and sound_speed = 60, the first six observations after a chirp must be silent, and energy above 0.01 must show up within the following 32 steps. */ +static int test_chirp_echo_arrives_after_two_way_travel_not_immediately(void) { + Bat env = make_test_env(); + env.num_obstacles = 0; + env.sound_speed = 60.0f; + c_reset(&env); + + env.x = 32.0f; + env.y = 32.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.heading = 0.0f; + env.bug_x = 38.0f; + env.bug_y = 32.0f; + env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + compute_observations(&env); + + env.actions[0] = NOOP; + env.actions[1] = TURN_NONE; + env.actions[2] = 7; + env.actions[3] = 7; + env.actions[4] = 0; + env.actions[5] = 1; + c_step(&env); + + for (int i = 0; i < 6; i++) { + ASSERT_FLOAT_NEAR(test_sum_obs(&env, LEFT_FREQ_OFFSET, FREQ_BINS), 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(test_sum_obs(&env, RIGHT_FREQ_OFFSET, FREQ_BINS), 0.0f, 0.0001f); + env.actions[5] = 0; + c_step(&env); + } + + float max_energy = 0.0f; + for (int i = 0; i < 32; i++) { + float energy = test_sum_obs(&env, LEFT_FREQ_OFFSET, FREQ_BINS) + + test_sum_obs(&env, RIGHT_FREQ_OFFSET, FREQ_BINS); + if (energy > max_energy) max_energy = energy; + c_step(&env); + } + + ASSERT_TRUE(max_energy > 0.01f); + + free_allocated(&env); + return 0; +} + /* With the bug placed CURRICULUM_INBOUND_MAX_BUG_DISTANCE away, scheduling every slice of a full-sweep chirp must queue at least one bucket that carries a bug echo path. */ +static int test_default_echo_range_reaches_curriculum_max_bug_distance(void) { + Bat env = { + .num_agents = 1, +
.num_obstacles = 0, + .max_speed = 22.0f, + .min_speed = 2.0f, + .accel = 45.0f, + .turn_rate = 9.424778f, + .ear_rear_gain = 0.20f, + .ear_front_gain = 0.55f, + .ear_side_gain = 0.35f, + .sound_speed = 180.0f, + .rng = 1, + }; + allocate(&env); + c_reset(&env); + + env.tick = 0; + env.x = 4.0f; + env.y = 32.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.heading = 0.0f; + env.bug_x = env.x + CURRICULUM_INBOUND_MAX_BUG_DISTANCE; + env.bug_y = env.y; + env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + clear_echo_queue(&env); + + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 0.0f, + .end_freq = 1.0f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = env.tick, + .active = 1, + }; + chirp.slice_count = (int)ceilf(chirp.duration / TICK_RATE); + while (chirp.slices_scheduled < chirp.slice_count) { + int slice_idx = chirp.slices_scheduled; + schedule_chirp_slice_echoes(&env, &chirp, slice_idx); + chirp.slices_scheduled += 1; + } + + int bug_echo_buckets = 0; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + if (env.echo_queue[i].closest_bug_echo_path >= 0.0f) { + bug_echo_buckets += 1; + } + } + + ASSERT_TRUE(bug_echo_buckets > 0); + + free_allocated(&env); + return 0; +} + /* Helper: total queued energy over every bucket, both ears, and all frequency bins. */ +static float test_sum_queued_echo_energy(Bat* env) { + float energy = 0.0f; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + for (int ear = 0; ear < 2; ear++) { + for (int bin = 0; bin < FREQ_BINS; bin++) { + energy += env->echo_queue[i].energy[ear][bin]; + } + } + } + return energy; +} + /* With no obstacles, schedule_corner_reflector_echoes must leave some non-zero energy in the echo queue. */ +static int test_corner_reflectors_enabled_schedule_stable_echo_events(void) { + Bat env = make_test_env(); + env.num_obstacles = 0; + env.sound_speed = 180.0f; + c_reset(&env); + + env.tick = 0; + env.x = 32.0f; + env.y = 32.0f; + env.heading = 0.0f; + env.vx = 0.0f; + env.vy = 0.0f; + clear_echo_queue(&env); + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 0.0f, + .end_freq = 1.0f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = env.tick, + .active = 1, + }; + +
schedule_corner_reflector_echoes(&env, &chirp, 0.0f, 0.5f); + + ASSERT_TRUE(test_sum_queued_echo_energy(&env) > 0.0f); + + free_allocated(&env); + return 0; +} + /* Jumps to the tick of a queued corner-reflector echo and checks that some frequency bin is non-zero and that every observation stays within [-1, 1]. NOTE(review): the search loop tests the energy of the whole queue, not of bucket i; confirm that is intended. */ +static int test_corner_reflector_echo_observations_stay_normalized(void) { + Bat env = make_test_env(); + env.num_obstacles = 0; + env.sound_speed = 180.0f; + c_reset(&env); + + env.tick = 0; + env.x = 32.0f; + env.y = 32.0f; + env.heading = 0.0f; + env.vx = 0.0f; + env.vy = 0.0f; + clear_echo_queue(&env); + ChirpEvent chirp = { + .x = env.x, + .y = env.y, + .start_freq = 0.0f, + .end_freq = 1.0f, + .duration = chirp_duration_seconds(0.0f), + .birth_tick = env.tick, + .active = 1, + }; + schedule_corner_reflector_echoes(&env, &chirp, 0.0f, 0.5f); + + int arrival_tick = -1; + for (int i = 0; i < ECHO_QUEUE_TICKS; i++) { + if (env.echo_queue[i].tick > 0 && test_sum_queued_echo_energy(&env) > 0.0f) { + arrival_tick = env.echo_queue[i].tick; + break; + } + } + ASSERT_TRUE(arrival_tick > 0); + + env.tick = arrival_tick; + compute_observations(&env); + ASSERT_TRUE(test_sum_obs(&env, LEFT_FREQ_OFFSET, FREQ_BINS) > 0.0f || + test_sum_obs(&env, RIGHT_FREQ_OFFSET, FREQ_BINS) > 0.0f); + for (int i = 0; i < OBS_SIZE; i++) { + ASSERT_TRUE(env.observations[i] >= -1.0f); + ASSERT_TRUE(env.observations[i] <= 1.0f); + } + + free_allocated(&env); + return 0; +} + /* Adding 0.75 twice through bat_clampf(..., 0, 1) must cap at 1.0, while a single 0.35 stays 0.35. NOTE(review): this writes env.observations directly and does not exercise the echo pipeline. */ +static int test_frequency_bin_energy_sums_and_caps(void) { + Bat env = make_test_env(); + memset(env.observations, 0, OBS_SIZE * sizeof(float)); + + int high_bin = FREQ_BINS - 1; + int low_bin = 0; + env.observations[LEFT_FREQ_OFFSET + high_bin] = bat_clampf( + env.observations[LEFT_FREQ_OFFSET + high_bin] + 0.75f, 0.0f, 1.0f); + env.observations[LEFT_FREQ_OFFSET + high_bin] = bat_clampf( + env.observations[LEFT_FREQ_OFFSET + high_bin] + 0.75f, 0.0f, 1.0f); + env.observations[RIGHT_FREQ_OFFSET + low_bin] = bat_clampf( + env.observations[RIGHT_FREQ_OFFSET + low_bin] + 0.35f, 0.0f, 1.0f); + + ASSERT_FLOAT_NEAR(env.observations[LEFT_FREQ_OFFSET + FREQ_BINS - 1],
1.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.observations[RIGHT_FREQ_OFFSET], 0.35f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* A bug echo with path 15 after a previous path of 20, with the bat 2 units from the previous echo position, must yield a reward above 0.0015 and 0.6 energy in left bin 8. */ +static int test_bug_echo_reward_is_added_when_bug_echo_is_closer(void) { + Bat env = make_test_env(); + c_reset(&env); + env.bug_echo_reward_scale = 0.05f; + env.last_bug_echo_path = 20.0f; + env.last_bug_echo_x = 8.0f; + env.last_bug_echo_y = 10.0f; + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 10.0f; + env.y = 10.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + clear_echo_queue(&env); + add_echo_event(&env, 0, 1.0f, 0.5f, 0.6f, 15.0f, ECHO_BUG); + + c_step(&env); + + ASSERT_TRUE(env.rewards[0] > 0.0015f); + ASSERT_FLOAT_NEAR(env.observations[LEFT_FREQ_OFFSET + 8], 0.6f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* Same closer echo, but with the bat placed at the previous echo position: the reward must be 0 while last_bug_echo_path still updates to 15. */ +static int test_bug_echo_reward_requires_bat_displacement(void) { + Bat env = make_test_env(); + c_reset(&env); + env.bug_echo_reward_scale = 0.05f; + env.last_bug_echo_path = 20.0f; + env.last_bug_echo_x = 10.0f; + env.last_bug_echo_y = 10.0f; + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 10.0f; + env.y = 10.0f; + env.heading = 0.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + clear_echo_queue(&env); + add_echo_event(&env, 0, 1.0f, 0.5f, 0.6f, 15.0f, ECHO_BUG); + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.rewards[0], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.last_bug_echo_path, 15.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* A bug echo with path 25 after a previous path of 20 must give a small negative reward (about -0.0001953) and update last_bug_echo_path to 25. */ +static int test_bug_echo_reward_penalizes_farther_bug_echo_weakly(void) { + Bat env = make_test_env(); + c_reset(&env); + env.bug_echo_reward_scale = 0.05f; + env.last_bug_echo_path = 20.0f; + env.last_bug_echo_x = 8.0f; + env.last_bug_echo_y = 10.0f; + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 10.0f; + env.y = 10.0f; + env.vx = 0.0f; + env.vy = 0.0f; +
env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + clear_echo_queue(&env); + add_echo_event(&env, 0, 1.0f, 0.5f, 0.6f, 25.0f, ECHO_BUG); + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.rewards[0], -0.0001953f, 0.0001f); + ASSERT_FLOAT_NEAR(env.last_bug_echo_path, 25.0f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* An ECHO_STATIC event must still show 0.6 energy in left bin 8 but contribute no reward. */ +static int test_static_echo_does_not_get_bug_echo_reward(void) { + Bat env = make_test_env(); + c_reset(&env); + env.bug_echo_reward_scale = 0.05f; + env.step_cost = 0.0f; + env.progress_reward_scale = 0.0f; + env.x = 10.0f; + env.y = 10.0f; + env.vx = 0.0f; + env.vy = 0.0f; + env.bug_vx = 0.0f; + env.bug_vy = 0.0f; + env.bug_x = 50.0f; + env.bug_y = 50.0f; + clear_echo_queue(&env); + add_echo_event(&env, 0, 1.0f, 0.5f, 0.6f, 15.0f, ECHO_STATIC); + + c_step(&env); + + ASSERT_FLOAT_NEAR(env.rewards[0], 0.0f, 0.0001f); + ASSERT_FLOAT_NEAR(env.observations[LEFT_FREQ_OFFSET + 8], 0.6f, 0.0001f); + + free_allocated(&env); + return 0; +} + /* Over 48 resets the bat-bug distance must equal curriculum_start_bug_distance, while the bat and bug spawn points must each stray more than 8 units from those of the first reset. */ +static int test_spawns_use_curriculum_distance_with_random_positions(void) { + Bat env = make_test_env(); + float first_x = 0.0f; + float first_y = 0.0f; + float first_bug_x = 0.0f; + float first_bug_y = 0.0f; + float max_bat_delta = 0.0f; + float max_bug_delta = 0.0f; + + for (int i = 0; i < 48; i++) { + c_reset(&env); + ASSERT_FLOAT_NEAR(dist(env.x, env.y, env.bug_x, env.bug_y), + env.curriculum_start_bug_distance, 0.001f); + if (i == 0) { + first_x = env.x; + first_y = env.y; + first_bug_x = env.bug_x; + first_bug_y = env.bug_y; + } else { + float bat_delta = dist(first_x, first_y, env.x, env.y); + float bug_delta = dist(first_bug_x, first_bug_y, env.bug_x, env.bug_y); + if (bat_delta > max_bat_delta) max_bat_delta = bat_delta; + if (bug_delta > max_bug_delta) max_bug_delta = bug_delta; + } + } + + ASSERT_TRUE(max_bat_delta > 8.0f); + ASSERT_TRUE(max_bug_delta > 8.0f); + + free_allocated(&env); + return 0; +} + /* At initial level 1, 32 resets must keep the bat-bug distance at the start distance plus one CURRICULUM_BUG_DISTANCE_STEP and keep both bodies clear of every obstacle with a 1-unit margin. */ +static int test_spawns_keep_minimum_separation_and_avoid_obstacles(void)
{ + Bat env = make_test_env(); + env.curriculum_initial_level = 1; + float expected_distance = env.curriculum_start_bug_distance + CURRICULUM_BUG_DISTANCE_STEP; + + for (int reset = 0; reset < 32; reset++) { + c_reset(&env); + ASSERT_FLOAT_NEAR(dist(env.x, env.y, env.bug_x, env.bug_y), + expected_distance, 0.001f); + for (int i = 0; i < env.num_obstacles; i++) { + ASSERT_TRUE(!circle_rect_collision(env.x, env.y, AGENT_RADIUS + 1.0f, + env.obstacle_x[i], env.obstacle_y[i], env.obstacle_w[i], env.obstacle_h[i])); + ASSERT_TRUE(!circle_rect_collision(env.bug_x, env.bug_y, BUG_RADIUS + 1.0f, + env.obstacle_x[i], env.obstacle_y[i], env.obstacle_w[i], env.obstacle_h[i])); + } + } + + free_allocated(&env); + return 0; +} + /* Across 32 further resets at initial level 1, the first obstacle must end up more than 16 units from its initial position at least once. */ +static int test_obstacles_move_substantially_across_resets(void) { + Bat env = make_test_env(); + env.curriculum_initial_level = 1; + c_reset(&env); + float first_x = env.obstacle_x[0]; + float first_y = env.obstacle_y[0]; + float max_delta = 0.0f; + + for (int i = 0; i < 32; i++) { + c_reset(&env); + float delta = dist(first_x, first_y, env.obstacle_x[0], env.obstacle_y[0]); + if (delta > max_delta) max_delta = delta; + } + + ASSERT_TRUE(max_delta > 16.0f); + + free_allocated(&env); + return 0; +} + /* Over 64 resets every obstacle side must lie in [3, 8] and every obstacle area must not exceed 64. */ +static int test_obstacles_are_small_enough_for_trainability(void) { + Bat env = make_test_env(); + env.curriculum_initial_level = 1; + + for (int reset = 0; reset < 64; reset++) { + c_reset(&env); + for (int i = 0; i < env.num_obstacles; i++) { + ASSERT_TRUE(env.obstacle_w[i] >= 3.0f); + ASSERT_TRUE(env.obstacle_h[i] >= 3.0f); + ASSERT_TRUE(env.obstacle_w[i] <= 8.0f); + ASSERT_TRUE(env.obstacle_h[i] <= 8.0f); + ASSERT_TRUE(env.obstacle_w[i] * env.obstacle_h[i] <= 64.0f); + } + } + + free_allocated(&env); + return 0; +} + /* Test driver: runs every test in the order listed, returns 1 as soon as one reports non-zero, and otherwise prints a success line and returns 0. */ +int main(void) { + if (test_chirp_metadata_and_observation_size()) return 1; + if (test_chirps_used_observation_tracks_emitted_chirps()) return 1; + if (test_max_chirps_stays_fixed_with_curriculum_level()) return 1; + if
(test_chirping_after_budget_terminates_with_penalty()) return 1; + if (test_timer_observation_tracks_elapsed_fraction()) return 1; + if (test_timeout_terminates_with_minus_one_reward()) return 1; + if (test_chirp_efficiency_scores_low_usage_above_full_budget()) return 1; + if (test_chirp_perf_uses_fixed_fifteen_chirp_reference()) return 1; + if (test_success_reward_includes_chirp_efficiency_bonus()) return 1; + if (test_curriculum_perf_uses_distance_and_obstacle_difficulty()) return 1; + if (test_perf_composes_base_perf_curriculum_difficulty_and_chirp_perf()) return 1; + if (test_left_right_echo_asymmetry()) return 1; + if (test_directional_echo_arrival_and_gain_by_side()) return 1; + if (test_ear_directivity_gains_control_echo_energy()) return 1; + if (test_default_sound_speed_allows_one_tick_interaural_delay()) return 1; + if (test_echo_scheduling_uses_tick_bucket_accumulator()) return 1; + if (test_bug_wing_sidebands_spill_adjacent_bins_without_reward_inflation()) return 1; + if (test_ear_separation_scale_controls_arrival_gap()) return 1; + if (test_doppler_sign_for_approaching_bug()) return 1; + if (test_wall_collision_is_terminal_minus_one()) return 1; + if (test_catch_bug_is_terminal_plus_one()) return 1; + if (test_progress_reward_sign()) return 1; + if (test_bat_cannot_accelerate_backward_from_brake()) return 1; + if (test_bat_reset_starts_with_forward_stall_speed()) return 1; + if (test_bat_brake_clamps_to_forward_stall_speed()) return 1; + if (test_bat_velocity_is_locked_to_heading()) return 1; + if (test_bat_zero_speed_recovers_to_forward_arc()) return 1; + if (test_bat_turn_rate_scales_with_forward_speed()) return 1; + if (test_bat_speed_action_space_has_no_strafe()) return 1; + if (test_chirp_audio_maps_norm_freq_to_audible_sweep()) return 1; + if (test_chirp_audio_duration_scales_with_render_fps()) return 1; + if (test_chirp_cooldown_accepts_only_after_delay()) return 1; + if (test_valid_chirp_gets_reward()) return 1; + if
(test_early_chirp_gets_penalty_and_emits_nothing()) return 1; + if (test_chirp_before_bug_echo_arrives_gets_scaled_overlap_penalty()) return 1; + if (test_chirp_after_bug_echo_arrives_ignores_static_echo_window()) return 1; + if (test_reflection_arrives_at_two_way_travel_time()) return 1; + if (test_bins_only_observation_layout()) return 1; + if (test_no_chirp_produces_silent_frequency_bins()) return 1; + if (test_observations_stay_normalized_after_chirp()) return 1; + if (test_curriculum_level_zero_starts_close_with_no_obstacles()) return 1; + if (test_curriculum_adds_first_obstacle_after_level_zero()) return 1; + if (test_curriculum_advances_after_catch()) return 1; + if (test_curriculum_waits_for_required_catches()) return 1; + if (test_curriculum_initial_level_sets_first_reset_difficulty()) return 1; + if (test_curriculum_initial_level_does_not_reset_progress()) return 1; + if (test_bug_bounces_off_arena_walls()) return 1; + if (test_chirp_echo_arrives_after_two_way_travel_not_immediately()) return 1; + if (test_default_echo_range_reaches_curriculum_max_bug_distance()) return 1; + if (test_corner_reflectors_enabled_schedule_stable_echo_events()) return 1; + if (test_corner_reflector_echo_observations_stay_normalized()) return 1; + if (test_frequency_bin_energy_sums_and_caps()) return 1; + if (test_bug_echo_reward_is_added_when_bug_echo_is_closer()) return 1; + if (test_bug_echo_reward_requires_bat_displacement()) return 1; + if (test_bug_echo_reward_penalizes_farther_bug_echo_weakly()) return 1; + if (test_static_echo_does_not_get_bug_echo_reward()) return 1; + if (test_spawns_use_curriculum_distance_with_random_positions()) return 1; + if (test_spawns_keep_minimum_separation_and_avoid_obstacles()) return 1; + if (test_obstacles_move_substantially_across_resets()) return 1; + if (test_obstacles_are_small_enough_for_trainability()) return 1; + + printf("bat core tests passed\n"); + return 0; +} diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py index
36e27bf42a..219a9cb9eb 100644 --- a/pufferlib/sweep.py +++ b/pufferlib/sweep.py @@ -146,7 +146,9 @@ def _params_from_puffer_sweep(sweep_config, only_include=None): for name, param in sweep_config.items(): if name in ('method', 'metric', 'metric_distribution', 'goal', 'downsample', 'use_gpu', 'prune_pareto', - 'sweep_only', 'max_suggestion_cost', 'early_stop_quantile', 'gpus', 'max_runs'): + 'sweep_only', 'max_suggestion_cost', 'early_stop_quantile', 'gpus', + 'max_runs', 'match_enemy_model_path', 'match_num_games', + 'match_enemy_hidden_size', 'match_enemy_num_layers'): continue assert isinstance(param, dict), f'Param {name} is not a dict'