Draft
Changes from all commits (42 commits)
a417266
Trains and Evals
Kinvert Jan 13, 2026
49af2d4
Reward Changes
Kinvert Jan 13, 2026
daaf902
Rendered with spheres or something
Kinvert Jan 13, 2026
332a9ae
Good Claude - Wireframe Planes
Kinvert Jan 13, 2026
0116b97
Physics model: incidence, comments, test suite
Kinvert Jan 13, 2026
b29bf5a
Renamed md Files
Kinvert Jan 13, 2026
95eb2ef
Moved Physics to File
Kinvert Jan 13, 2026
3582d2d
Physics in Own File - Test Flights
Kinvert Jan 14, 2026
1c30c54
Coordinated Turn Tests
Kinvert Jan 14, 2026
1131e83
Simple Optimizations
Kinvert Jan 14, 2026
374871d
Small Perf - Move cosf Out of Loop
Kinvert Jan 14, 2026
8598067
Autopilot Separate File
Kinvert Jan 14, 2026
80bcf31
Vectorized Autopilot
Kinvert Jan 14, 2026
0a1c2e6
Weighted Random Actions
Kinvert Jan 15, 2026
63a7aae
Observation Schemas Swept
Kinvert Jan 15, 2026
04dd016
Rewards Fixed - Sweepable
Kinvert Jan 15, 2026
26709b9
Preparing for Sweeps
Kinvert Jan 15, 2026
a31d1dc
Fix Terminals and Logging
Kinvert Jan 15, 2026
3cc5b58
More Sweep Prep
Kinvert Jan 15, 2026
17f18c1
Fix Reward and Score
Kinvert Jan 15, 2026
d639ee3
Temp Undo Later - Clamp logstd
Kinvert Jan 15, 2026
2606e20
Apply Sweep df1 84 u5i33hej
Kinvert Jan 16, 2026
bc72836
New Obs Schemas - New Sweep Prep
Kinvert Jan 16, 2026
fe7e26a
Roll Penalty - Elevator Might Be Inverted
Kinvert Jan 16, 2026
652ab7a
Fix Elevator Problems
Kinvert Jan 17, 2026
30fa9fe
Fix Obs 5 Schema and Adjust Penalties
Kinvert Jan 17, 2026
ab222bf
Increase Batch Size for Speed
Kinvert Jan 17, 2026
7fd88f1
Next Sweep Improvements - Likes to Aileron Roll too Much
Kinvert Jan 17, 2026
9dca5c6
Reduce Prints
Kinvert Jan 17, 2026
b68d1b2
Simplify Penalties and Rewards
Kinvert Jan 18, 2026
03d1ebc
Try to Avoid NAN
Kinvert Jan 18, 2026
7a15539
Trying to Stop NANs
Kinvert Jan 18, 2026
2c3073f
Debug Prints
Kinvert Jan 18, 2026
be1e31c
Fix Mean Outside Bounds
Kinvert Jan 18, 2026
f6c821d
Still Trying to Fix Blowups
Kinvert Jan 18, 2026
3f0f8b4
Revert Some Ini Values
Kinvert Jan 18, 2026
6c61df6
Restore Much of Ini to 9dca5c6
Kinvert Jan 18, 2026
faf6eb6
Reduce Learning Rate Again
Kinvert Jan 18, 2026
4e640ee
Trying to Fix Curriculum - Agent Trains Poorly
Kinvert Jan 18, 2026
f302224
Aim Annealing - Removed Some Penalties
Kinvert Jan 19, 2026
f000fb8
Added More Debugging
Kinvert Jan 19, 2026
7a75d2b
Some Fixes - SPS Gains - New Sweep Soon
Kinvert Jan 19, 2026
1 change: 1 addition & 0 deletions .gitignore
@@ -162,3 +162,4 @@ pufferlib/ocean/impulse_wars/*-release/
pufferlib/ocean/impulse_wars/debug-*/
pufferlib/ocean/impulse_wars/release-*/
pufferlib/ocean/impulse_wars/benchmark/
+pufferlib/ocean/dogfight/dogfight_test
19 changes: 6 additions & 13 deletions pufferlib/config/default.ini
@@ -28,24 +28,24 @@ device = cuda
optimizer = muon
anneal_lr = True
precision = float32
-total_timesteps = 10_000_000
+total_timesteps = 100_000_000
learning_rate = 0.015
gamma = 0.995
-gae_lambda = 0.90
+gae_lambda = 0.95
update_epochs = 1
clip_coef = 0.2
vf_coef = 2.0
vf_clip_coef = 0.2
max_grad_norm = 1.5
-ent_coef = 0.001
+ent_coef = 0.01
adam_beta1 = 0.95
adam_beta2 = 0.999
adam_eps = 1e-12

data_dir = experiments
checkpoint_interval = 200
batch_size = auto
-minibatch_size = 8192
+minibatch_size = 16384

# Accumulate gradients above this size
max_minibatch_size = 32768
@@ -58,7 +58,7 @@ vtrace_rho_clip = 1.0
vtrace_c_clip = 1.0

prio_alpha = 0.8
-prio_beta0 = 0.2
+prio_beta0 = 0.5

[sweep]
method = Protein
@@ -83,16 +83,9 @@ max = 1e10
mean = 2e8
scale = time

-[sweep.train.bptt_horizon]
-distribution = uniform_pow2
-min = 16
-max = 64
-mean = 64
-scale = auto

[sweep.train.minibatch_size]
distribution = uniform_pow2
-min = 8192
+min = 16384
max = 65536
mean = 32768
scale = auto
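The "# Accumulate gradients above this size" comment pairs max_minibatch_size with the raised minibatch_size default above. A minimal sketch of how that accumulation could work, assuming the trainer splits an oversized minibatch into equal chunks and scales the partial losses; the toy policy, optimizer, and loss below are illustrative stand-ins, not PufferLib's actual trainer API:

import torch
import torch.nn as nn

def accumulation_chunks(minibatch_size: int, max_minibatch_size: int) -> int:
    # Number of forward/backward passes so no single pass exceeds the cap.
    # With the new defaults (16384 vs 32768) this is 1; a swept minibatch
    # of 65536 would give 2.
    return max(1, (minibatch_size + max_minibatch_size - 1) // max_minibatch_size)

# Toy stand-ins: a linear "policy" and a random minibatch of observations.
policy = nn.Linear(8, 1)
optimizer = torch.optim.SGD(policy.parameters(), lr=1e-3)
minibatch = torch.randn(65536, 8)

chunks = accumulation_chunks(len(minibatch), 32768)
optimizer.zero_grad()
for part in torch.chunk(minibatch, chunks):
    # Scale each partial loss by 1/chunks so the summed gradients match a
    # single full-minibatch backward pass, then step once at the end.
    loss = policy(part).pow(2).mean() / chunks
    loss.backward()
optimizer.step()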
206 changes: 206 additions & 0 deletions pufferlib/config/ocean/dogfight.ini
@@ -0,0 +1,206 @@
[base]
env_name = puffer_dogfight
package = ocean
policy_name = Policy
rnn_name = Recurrent

[policy]
hidden_size = 128

[rnn]
input_size = 128
hidden_size = 128

[vec]
num_envs = 8

[env]
alt_max = 2500.0
curriculum_enabled = 1
curriculum_randomize = 0
episodes_per_stage = 25
max_steps = 3000
num_envs = 1024
obs_scheme = 4
penalty_aileron = 0.004787828722037375
penalty_bias = 0.019452434551902115
penalty_neg_g = 0.038022669870406395
penalty_roll = 0.0019647147422656415
penalty_rudder = 0.00015276678362861277
penalty_stall = 0.0007385806553065777
reward_approach = 0.0065743024460971355
reward_closing_scale = 0.0011914868978783488
reward_firing_solution = 0.045721526537090544
reward_level = 0.025920397927984597
reward_tail_scale = 0.0009967820532619954
reward_tracking = 0.031819639401510356
speed_min = 50.0
aim_cone_start = 0.3856851533800364
aim_cone_end = 0.05015228554606438
aim_anneal_episodes = 500

[train]
adam_beta1 = 0.9723082880428708
adam_beta2 = 0.9912225347178505
adam_eps = 8.183951125996682e-13
batch_size = auto
bptt_horizon = 64
checkpoint_interval = 200
clip_coef = 0.983341504810378
ent_coef = 0.03071064008271062
gae_lambda = 0.9949418302404375
gamma = 0.9855692943246729
learning_rate = 0.0003102693135543651
max_grad_norm = 1.955089159309864
max_minibatch_size = 32768
minibatch_size = 65536
prio_alpha = 0.9022484586887103
prio_beta0 = 0.8983571008600393
seed = 42
total_timesteps = 100_000_000
update_epochs = 4
vf_clip_coef = 0.4664481597021223
vf_coef = 1.3376509584486485
vtrace_c_clip = 0.4391395812854171
vtrace_rho_clip = 4.6142582874745

[sweep]
downsample = 1
goal = maximize
method = Protein
metric = ultimate
prune_pareto = True
use_gpu = True

[sweep.env.obs_scheme]
distribution = int_uniform
max = 5
mean = 0
min = 0
scale = 1.0

[sweep.env.episodes_per_stage]
distribution = int_uniform
min = 20
max = 75
mean = 25
scale = 1.0

[sweep.env.penalty_stall]
distribution = uniform
max = 0.005
mean = 0.0016092406492793122
min = 0.0
scale = auto

[sweep.env.penalty_roll]
distribution = uniform
max = 0.003
mean = 0.0021072644960864573
min = 0.0
scale = auto

[sweep.env.penalty_neg_g]
distribution = uniform
max = 0.1
mean = 0.05
min = 0.01
scale = auto

[sweep.env.penalty_rudder]
distribution = uniform
max = 0.001
mean = 0.0002985792260932028
min = 0.0001
scale = auto

[sweep.env.penalty_aileron]
distribution = uniform
max = 0.005
mean = 0.002
min = 0.0
scale = auto

[sweep.env.penalty_bias]
distribution = uniform
max = 0.02
mean = 0.008614029763839244
min = 0.001
scale = auto

[sweep.env.reward_approach]
distribution = uniform
max = 0.02
mean = 0.003836667464147351
min = 0.0
scale = auto

[sweep.env.reward_level]
distribution = uniform
max = 0.05
mean = 0.029797846539013125
min = 0.0
scale = auto

[sweep.env.reward_closing_scale]
distribution = uniform
max = 0.005
mean = 0.005
min = 0.0
scale = auto

[sweep.env.reward_firing_solution]
distribution = uniform
max = 0.1
mean = 0.01
min = 0.0
scale = auto

[sweep.env.reward_tail_scale]
distribution = uniform
max = 0.01
mean = 0.005
min = 0.0
scale = auto

[sweep.env.reward_tracking]
distribution = uniform
max = 0.05
mean = 0.005177132307187232
min = 0.0
scale = auto

[sweep.env.aim_cone_start]
distribution = uniform
max = 0.52
mean = 0.35
min = 0.17
scale = auto

[sweep.env.aim_cone_end]
distribution = uniform
max = 0.17
mean = 0.087
min = 0.05
scale = auto

[sweep.env.aim_anneal_episodes]
distribution = int_uniform
min = 50
max = 5000
mean = 500
scale = 1.0

[sweep.train.learning_rate]
distribution = log_normal
max = 0.0005
mean = 9.0e-06
min = 0.000000001
scale = 0.5

[sweep.train.total_timesteps]
distribution = log_normal
max = 1.01e8
mean = 1.005e8
min = 1.0e8
scale = time
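The new aim_cone_start, aim_cone_end, and aim_anneal_episodes keys line up with the "Aim Annealing" commit. The environment itself is C, but here is a minimal Python sketch of one plausible schedule, assuming a linear interpolation from the wide starting cone to the tight final cone over the configured number of episodes; the function name is hypothetical and the defaults are the swept values above, rounded:

def aim_cone_radians(episode, start=0.3857, end=0.0502, anneal_episodes=500):
    # Linearly shrink the aim cone (in radians) from start to end over
    # anneal_episodes, then hold it at end.
    frac = min(1.0, episode / max(1, anneal_episodes))
    return start + (end - start) * frac

# Early episodes accept a loose firing solution (~22 degrees);
# by episode 500 the cone has tightened to roughly 2.9 degrees.
print(aim_cone_radians(0), aim_cone_radians(500))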
2 changes: 1 addition & 1 deletion pufferlib/environments/mani_skill/torch.py
@@ -64,7 +64,7 @@ def decode_actions(self, hidden):
'''Decodes a batch of hidden states into (multi)discrete actions.
Assumes no time dimension (handled by LSTM wrappers).'''
mean = self.decoder_mean(hidden)
-logstd = self.decoder_logstd.expand_as(mean)
+logstd = self.decoder_logstd.expand_as(mean).clamp(min=-20, max=2)
std = torch.exp(logstd)
logits = torch.distributions.Normal(mean, std)
values = self.value(hidden)
2 changes: 1 addition & 1 deletion pufferlib/models.py
@@ -88,7 +88,7 @@ def decode_actions(self, hidden):
logits = self.decoder(hidden).split(self.action_nvec, dim=1)
elif self.is_continuous:
mean = self.decoder_mean(hidden)
-logstd = self.decoder_logstd.expand_as(mean)
+logstd = self.decoder_logstd.expand_as(mean).clamp(min=-20, max=2)
std = torch.exp(logstd)
logits = torch.distributions.Normal(mean, std)
else:
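Both decode_actions diffs clamp the expanded logstd to [-20, 2] before exponentiating, matching the "Temp Undo Later - Clamp logstd" commit. A small sketch of why this guards against the NaN blowups noted in the commit log: exp of an unbounded log-std can underflow to zero or overflow, which poisons the Normal's log-probs during the policy update, while the clamp keeps the scale in a safe range.

import torch

logstd = torch.tensor([-200.0, 0.0, 20.0])

# Unclamped: exp() underflows to 0 or blows up (~[0., 1., 4.85e+08]),
# and a zero or huge scale turns Normal log-probs into inf/NaN.
std_raw = torch.exp(logstd)

# Clamped as in the diff: the scale stays within [exp(-20), exp(2)],
# roughly [2.1e-9, 7.39], so log-probs remain finite.
std_clamped = torch.exp(logstd.clamp(min=-20, max=2))
dist = torch.distributions.Normal(torch.zeros(3), std_clamped)
print(std_clamped, dist.log_prob(torch.zeros(3)))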