PPO playing BipedalWalker-v3 from https://github.com/sgoodfriend/rl-algo-impls/tree/983cb75e43e51cf4ef57f177194ab9a4a1a8808b
3f2a7a0
CartPole-v1: &cartpole-defaults | |
n_timesteps: !!float 1e5 | |
env_hyperparams: | |
n_envs: 8 | |
algo_hyperparams: | |
n_steps: 32 | |
batch_size: 256 | |
n_epochs: 20 | |
gae_lambda: 0.8 | |
gamma: 0.98 | |
ent_coef: 0.0 | |
learning_rate: 0.001 | |
learning_rate_decay: linear | |
clip_range: 0.2 | |
clip_range_decay: linear | |
eval_hyperparams: | |
step_freq: !!float 2.5e4 | |
CartPole-v0: | |
<<: *cartpole-defaults | |
n_timesteps: !!float 5e4 | |
MountainCar-v0: | |
n_timesteps: !!float 1e6 | |
env_hyperparams: | |
normalize: true | |
n_envs: 16 | |
algo_hyperparams: | |
n_steps: 16 | |
n_epochs: 4 | |
gae_lambda: 0.98 | |
gamma: 0.99 | |
ent_coef: 0.0 | |
MountainCarContinuous-v0: | |
n_timesteps: !!float 1e5 | |
env_hyperparams: | |
normalize: true | |
n_envs: 4 | |
# policy_hyperparams: | |
# init_layers_orthogonal: false | |
# log_std_init: -3.29 | |
# use_sde: true | |
algo_hyperparams: | |
n_steps: 512 | |
batch_size: 256 | |
n_epochs: 10 | |
learning_rate: !!float 7.77e-5 | |
ent_coef: 0.01 # 0.00429 | |
ent_coef_decay: linear | |
clip_range: 0.1 | |
gae_lambda: 0.9 | |
max_grad_norm: 5 | |
vf_coef: 0.19 | |
eval_hyperparams: | |
step_freq: 5000 | |
Acrobot-v1: | |
n_timesteps: !!float 1e6 | |
env_hyperparams: | |
n_envs: 16 | |
normalize: true | |
algo_hyperparams: | |
n_steps: 256 | |
n_epochs: 4 | |
gae_lambda: 0.94 | |
gamma: 0.99 | |
ent_coef: 0.0 | |
LunarLander-v2: | |
n_timesteps: !!float 4e6 | |
env_hyperparams: | |
n_envs: 16 | |
algo_hyperparams: | |
n_steps: 1024 | |
batch_size: 64 | |
n_epochs: 4 | |
gae_lambda: 0.98 | |
gamma: 0.999 | |
learning_rate: !!float 5e-4 | |
learning_rate_decay: linear | |
clip_range: 0.2 | |
clip_range_decay: linear | |
ent_coef: 0.01 | |
normalize_advantage: false | |
BipedalWalker-v3: | |
n_timesteps: !!float 10e6 | |
env_hyperparams: | |
n_envs: 16 | |
normalize: true | |
algo_hyperparams: | |
n_steps: 2048 | |
batch_size: 64 | |
gae_lambda: 0.95 | |
gamma: 0.99 | |
n_epochs: 10 | |
ent_coef: 0.001 | |
learning_rate: !!float 2.5e-4 | |
learning_rate_decay: linear | |
clip_range: 0.2 | |
clip_range_decay: linear | |
CarRacing-v0: &carracing-defaults | |
n_timesteps: !!float 4e6 | |
env_hyperparams: | |
n_envs: 8 | |
frame_stack: 4 | |
policy_hyperparams: &carracing-policy-defaults | |
use_sde: true | |
log_std_init: -2 | |
init_layers_orthogonal: false | |
activation_fn: relu | |
share_features_extractor: false | |
cnn_flatten_dim: 256 | |
hidden_sizes: [256] | |
algo_hyperparams: | |
n_steps: 512 | |
batch_size: 128 | |
n_epochs: 10 | |
learning_rate: !!float 1e-4 | |
learning_rate_decay: linear | |
gamma: 0.99 | |
gae_lambda: 0.95 | |
ent_coef: 0.0 | |
sde_sample_freq: 4 | |
max_grad_norm: 0.5 | |
vf_coef: 0.5 | |
clip_range: 0.2 | |
impala-CarRacing-v0: | |
<<: *carracing-defaults | |
env_id: CarRacing-v0 | |
policy_hyperparams: | |
<<: *carracing-policy-defaults | |
cnn_style: impala | |
init_layers_orthogonal: true | |
cnn_layers_init_orthogonal: false | |
hidden_sizes: [] | |
# BreakoutNoFrameskip-v4 | |
# PongNoFrameskip-v4 | |
# SpaceInvadersNoFrameskip-v4 | |
# QbertNoFrameskip-v4 | |
_atari: &atari-defaults | |
n_timesteps: !!float 1e7 | |
env_hyperparams: &atari-env-defaults | |
n_envs: 8 | |
frame_stack: 4 | |
no_reward_timeout_steps: 1000 | |
no_reward_fire_steps: 500 | |
vec_env_class: async | |
policy_hyperparams: &atari-policy-defaults | |
activation_fn: relu | |
algo_hyperparams: &atari-algo-defaults | |
n_steps: 128 | |
batch_size: 256 | |
n_epochs: 4 | |
learning_rate: !!float 2.5e-4 | |
learning_rate_decay: linear | |
clip_range: 0.1 | |
clip_range_decay: linear | |
vf_coef: 0.5 | |
ent_coef: 0.01 | |
eval_hyperparams: | |
deterministic: false | |
_norm-rewards-atari: &norm-rewards-atari-default | |
<<: *atari-defaults | |
env_hyperparams: | |
<<: *atari-env-defaults | |
clip_atari_rewards: false | |
normalize: true | |
normalize_kwargs: | |
norm_obs: false | |
norm_reward: true | |
norm-rewards-BreakoutNoFrameskip-v4: | |
<<: *norm-rewards-atari-default | |
env_id: BreakoutNoFrameskip-v4 | |
debug-PongNoFrameskip-v4: | |
<<: *atari-defaults | |
device: cpu | |
env_id: PongNoFrameskip-v4 | |
env_hyperparams: | |
<<: *atari-env-defaults | |
vec_env_class: sync | |
_impala-atari: &impala-atari-defaults | |
<<: *atari-defaults | |
policy_hyperparams: | |
<<: *atari-policy-defaults | |
cnn_style: impala | |
cnn_flatten_dim: 256 | |
init_layers_orthogonal: true | |
cnn_layers_init_orthogonal: false | |
impala-PongNoFrameskip-v4: | |
<<: *impala-atari-defaults | |
env_id: PongNoFrameskip-v4 | |
impala-BreakoutNoFrameskip-v4: | |
<<: *impala-atari-defaults | |
env_id: BreakoutNoFrameskip-v4 | |
impala-SpaceInvadersNoFrameskip-v4: | |
<<: *impala-atari-defaults | |
env_id: SpaceInvadersNoFrameskip-v4 | |
impala-QbertNoFrameskip-v4: | |
<<: *impala-atari-defaults | |
env_id: QbertNoFrameskip-v4 | |
_microrts: &microrts-defaults | |
<<: *atari-defaults | |
n_timesteps: !!float 2e6 | |
env_hyperparams: &microrts-env-defaults | |
n_envs: 8 | |
vec_env_class: sync | |
mask_actions: true | |
policy_hyperparams: &microrts-policy-defaults | |
<<: *atari-policy-defaults | |
cnn_style: microrts | |
cnn_flatten_dim: 128 | |
algo_hyperparams: &microrts-algo-defaults | |
<<: *atari-algo-defaults | |
clip_range_decay: none | |
clip_range_vf: 0.1 | |
ppo2_vf_coef_halving: true | |
eval_hyperparams: &microrts-eval-defaults | |
deterministic: false # Good idea because MultiCategorical mode isn't great | |
_no-mask-microrts: &no-mask-microrts-defaults | |
<<: *microrts-defaults | |
env_hyperparams: | |
<<: *microrts-env-defaults | |
mask_actions: false | |
MicrortsMining-v1-NoMask: | |
<<: *no-mask-microrts-defaults | |
env_id: MicrortsMining-v1 | |
MicrortsAttackShapedReward-v1-NoMask: | |
<<: *no-mask-microrts-defaults | |
env_id: MicrortsAttackShapedReward-v1 | |
MicrortsRandomEnemyShapedReward3-v1-NoMask: | |
<<: *no-mask-microrts-defaults | |
env_id: MicrortsRandomEnemyShapedReward3-v1 | |
_microrts_ai: &microrts-ai-defaults | |
<<: *microrts-defaults | |
n_timesteps: !!float 100e6 | |
additional_keys_to_log: ["microrts_stats", "microrts_results"] | |
env_hyperparams: &microrts-ai-env-defaults | |
n_envs: 24 | |
env_type: microrts | |
make_kwargs: &microrts-ai-env-make-kwargs-defaults | |
num_selfplay_envs: 0 | |
max_steps: 4000 | |
render_theme: 2 | |
map_paths: [maps/16x16/basesWorkers16x16.xml] | |
reward_weight: [10.0, 1.0, 1.0, 0.2, 1.0, 4.0] | |
policy_hyperparams: &microrts-ai-policy-defaults | |
<<: *microrts-policy-defaults | |
cnn_flatten_dim: 256 | |
actor_head_style: gridnet | |
algo_hyperparams: &microrts-ai-algo-defaults | |
<<: *microrts-algo-defaults | |
learning_rate: !!float 2.5e-4 | |
learning_rate_decay: linear | |
n_steps: 512 | |
batch_size: 3072 | |
n_epochs: 4 | |
ent_coef: 0.01 | |
vf_coef: 0.5 | |
max_grad_norm: 0.5 | |
clip_range: 0.1 | |
clip_range_vf: 0.1 | |
eval_hyperparams: &microrts-ai-eval-defaults | |
<<: *microrts-eval-defaults | |
score_function: mean | |
max_video_length: 4000 | |
env_overrides: &microrts-ai-eval-env-overrides | |
make_kwargs: | |
<<: *microrts-ai-env-make-kwargs-defaults | |
max_steps: 4000 | |
reward_weight: [1.0, 0, 0, 0, 0, 0] | |
MicrortsAttackPassiveEnemySparseReward-v3: | |
<<: *microrts-ai-defaults | |
n_timesteps: !!float 2e6 | |
env_id: MicrortsAttackPassiveEnemySparseReward-v3 # Workaround to keep model name simple | |
env_hyperparams: | |
<<: *microrts-ai-env-defaults | |
bots: | |
passiveAI: 24 | |
MicrortsDefeatRandomEnemySparseReward-v3: &microrts-random-ai-defaults | |
<<: *microrts-ai-defaults | |
n_timesteps: !!float 2e6 | |
env_id: MicrortsDefeatRandomEnemySparseReward-v3 # Workaround to keep model name simple | |
env_hyperparams: | |
<<: *microrts-ai-env-defaults | |
bots: | |
randomBiasedAI: 24 | |
enc-dec-MicrortsDefeatRandomEnemySparseReward-v3: | |
<<: *microrts-random-ai-defaults | |
policy_hyperparams: | |
<<: *microrts-ai-policy-defaults | |
cnn_style: gridnet_encoder | |
actor_head_style: gridnet_decoder | |
v_hidden_sizes: [128] | |
unet-MicrortsDefeatRandomEnemySparseReward-v3: | |
<<: *microrts-random-ai-defaults | |
# device: cpu | |
policy_hyperparams: | |
<<: *microrts-ai-policy-defaults | |
actor_head_style: unet | |
v_hidden_sizes: [256, 128] | |
algo_hyperparams: | |
<<: *microrts-ai-algo-defaults | |
learning_rate: !!float 2.5e-4 | |
learning_rate_decay: spike | |
MicrortsDefeatCoacAIShaped-v3: &microrts-coacai-defaults | |
<<: *microrts-ai-defaults | |
env_id: MicrortsDefeatCoacAIShaped-v3 # Workaround to keep model name simple | |
n_timesteps: !!float 300e6 | |
env_hyperparams: &microrts-coacai-env-defaults | |
<<: *microrts-ai-env-defaults | |
bots: | |
coacAI: 24 | |
eval_hyperparams: &microrts-coacai-eval-defaults | |
<<: *microrts-ai-eval-defaults | |
step_freq: !!float 1e6 | |
n_episodes: 26 | |
env_overrides: &microrts-coacai-eval-env-overrides | |
<<: *microrts-ai-eval-env-overrides | |
n_envs: 26 | |
bots: | |
coacAI: 2 | |
randomBiasedAI: 2 | |
randomAI: 2 | |
passiveAI: 2 | |
workerRushAI: 2 | |
lightRushAI: 2 | |
naiveMCTSAI: 2 | |
mixedBot: 2 | |
rojo: 2 | |
izanagi: 2 | |
tiamat: 2 | |
droplet: 2 | |
guidedRojoA3N: 2 | |
MicrortsDefeatCoacAIShaped-v3-diverseBots: &microrts-diverse-defaults | |
<<: *microrts-coacai-defaults | |
env_hyperparams: | |
<<: *microrts-coacai-env-defaults | |
bots: | |
coacAI: 18 | |
randomBiasedAI: 2 | |
lightRushAI: 2 | |
workerRushAI: 2 | |
enc-dec-MicrortsDefeatCoacAIShaped-v3-diverseBots: | |
&microrts-env-dec-diverse-defaults | |
<<: *microrts-diverse-defaults | |
policy_hyperparams: | |
<<: *microrts-ai-policy-defaults | |
cnn_style: gridnet_encoder | |
actor_head_style: gridnet_decoder | |
v_hidden_sizes: [128] | |
debug-enc-dec-MicrortsDefeatCoacAIShaped-v3-diverseBots: | |
<<: *microrts-env-dec-diverse-defaults | |
n_timesteps: !!float 1e6 | |
unet-MicrortsDefeatCoacAIShaped-v3-diverseBots: &microrts-unet-defaults | |
<<: *microrts-diverse-defaults | |
policy_hyperparams: | |
<<: *microrts-ai-policy-defaults | |
actor_head_style: unet | |
v_hidden_sizes: [256, 128] | |
algo_hyperparams: &microrts-unet-algo-defaults | |
<<: *microrts-ai-algo-defaults | |
learning_rate: !!float 2.5e-4 | |
learning_rate_decay: spike | |
Microrts-selfplay-unet: &microrts-selfplay-defaults | |
<<: *microrts-unet-defaults | |
env_hyperparams: &microrts-selfplay-env-defaults | |
<<: *microrts-ai-env-defaults | |
make_kwargs: &microrts-selfplay-env-make-kwargs-defaults | |
<<: *microrts-ai-env-make-kwargs-defaults | |
num_selfplay_envs: 36 | |
self_play_kwargs: | |
num_old_policies: 12 | |
save_steps: 300000 | |
swap_steps: 6000 | |
swap_window_size: 4 | |
window: 33 | |
eval_hyperparams: &microrts-selfplay-eval-defaults | |
<<: *microrts-coacai-eval-defaults | |
env_overrides: &microrts-selfplay-eval-env-overrides | |
<<: *microrts-coacai-eval-env-overrides | |
self_play_kwargs: {} | |
Microrts-selfplay-unet-winloss: &microrts-selfplay-winloss-defaults | |
<<: *microrts-selfplay-defaults | |
env_hyperparams: | |
<<: *microrts-selfplay-env-defaults | |
make_kwargs: | |
<<: *microrts-selfplay-env-make-kwargs-defaults | |
reward_weight: [1.0, 0, 0, 0, 0, 0] | |
algo_hyperparams: &microrts-selfplay-winloss-algo-defaults | |
<<: *microrts-unet-algo-defaults | |
gamma: 0.999 | |
Microrts-selfplay-unet-decay: &microrts-selfplay-decay-defaults | |
<<: *microrts-selfplay-defaults | |
microrts_reward_decay_callback: true | |
algo_hyperparams: | |
<<: *microrts-unet-algo-defaults | |
gamma_end: 0.999 | |
Microrts-selfplay-unet-debug: &microrts-selfplay-debug-defaults | |
<<: *microrts-selfplay-decay-defaults | |
eval_hyperparams: | |
<<: *microrts-selfplay-eval-defaults | |
step_freq: !!float 1e5 | |
env_overrides: | |
<<: *microrts-selfplay-eval-env-overrides | |
n_envs: 24 | |
bots: | |
coacAI: 12 | |
randomBiasedAI: 4 | |
workerRushAI: 4 | |
lightRushAI: 4 | |
Microrts-selfplay-unet-debug-mps: | |
<<: *microrts-selfplay-debug-defaults | |
device: mps | |
HalfCheetahBulletEnv-v0: &pybullet-defaults | |
n_timesteps: !!float 2e6 | |
env_hyperparams: &pybullet-env-defaults | |
n_envs: 16 | |
normalize: true | |
policy_hyperparams: &pybullet-policy-defaults | |
pi_hidden_sizes: [256, 256] | |
v_hidden_sizes: [256, 256] | |
activation_fn: relu | |
algo_hyperparams: &pybullet-algo-defaults | |
n_steps: 512 | |
batch_size: 128 | |
n_epochs: 20 | |
gamma: 0.99 | |
gae_lambda: 0.9 | |
ent_coef: 0.0 | |
max_grad_norm: 0.5 | |
vf_coef: 0.5 | |
learning_rate: !!float 3e-5 | |
clip_range: 0.4 | |
AntBulletEnv-v0: | |
<<: *pybullet-defaults | |
policy_hyperparams: | |
<<: *pybullet-policy-defaults | |
algo_hyperparams: | |
<<: *pybullet-algo-defaults | |
Walker2DBulletEnv-v0: | |
<<: *pybullet-defaults | |
algo_hyperparams: | |
<<: *pybullet-algo-defaults | |
clip_range_decay: linear | |
HopperBulletEnv-v0: | |
<<: *pybullet-defaults | |
algo_hyperparams: | |
<<: *pybullet-algo-defaults | |
clip_range_decay: linear | |
HumanoidBulletEnv-v0: | |
<<: *pybullet-defaults | |
n_timesteps: !!float 1e7 | |
env_hyperparams: | |
<<: *pybullet-env-defaults | |
n_envs: 8 | |
policy_hyperparams: | |
<<: *pybullet-policy-defaults | |
# log_std_init: -1 | |
algo_hyperparams: | |
<<: *pybullet-algo-defaults | |
n_steps: 2048 | |
batch_size: 64 | |
n_epochs: 10 | |
gae_lambda: 0.95 | |
learning_rate: !!float 2.5e-4 | |
clip_range: 0.2 | |
_procgen: &procgen-defaults | |
env_hyperparams: &procgen-env-defaults | |
env_type: procgen | |
n_envs: 64 | |
# grayscale: false | |
# frame_stack: 4 | |
normalize: true # procgen only normalizes reward | |
make_kwargs: &procgen-make-kwargs-defaults | |
num_threads: 8 | |
policy_hyperparams: &procgen-policy-defaults | |
activation_fn: relu | |
cnn_style: impala | |
cnn_flatten_dim: 256 | |
init_layers_orthogonal: true | |
cnn_layers_init_orthogonal: false | |
algo_hyperparams: &procgen-algo-defaults | |
gamma: 0.999 | |
gae_lambda: 0.95 | |
n_steps: 256 | |
batch_size: 2048 | |
n_epochs: 3 | |
ent_coef: 0.01 | |
clip_range: 0.2 | |
# clip_range_decay: linear | |
clip_range_vf: 0.2 | |
learning_rate: !!float 5e-4 | |
# learning_rate_decay: linear | |
vf_coef: 0.5 | |
eval_hyperparams: &procgen-eval-defaults | |
ignore_first_episode: true | |
# deterministic: false | |
step_freq: !!float 1e5 | |
_procgen-easy: &procgen-easy-defaults | |
<<: *procgen-defaults | |
n_timesteps: !!float 25e6 | |
env_hyperparams: &procgen-easy-env-defaults | |
<<: *procgen-env-defaults | |
make_kwargs: | |
<<: *procgen-make-kwargs-defaults | |
distribution_mode: easy | |
procgen-coinrun-easy: &coinrun-easy-defaults | |
<<: *procgen-easy-defaults | |
env_id: coinrun | |
debug-procgen-coinrun: | |
<<: *coinrun-easy-defaults | |
device: cpu | |
procgen-starpilot-easy: | |
<<: *procgen-easy-defaults | |
env_id: starpilot | |
procgen-bossfight-easy: | |
<<: *procgen-easy-defaults | |
env_id: bossfight | |
procgen-bigfish-easy: | |
<<: *procgen-easy-defaults | |
env_id: bigfish | |
_procgen-hard: &procgen-hard-defaults | |
<<: *procgen-defaults | |
n_timesteps: !!float 200e6 | |
env_hyperparams: &procgen-hard-env-defaults | |
<<: *procgen-env-defaults | |
n_envs: 256 | |
make_kwargs: | |
<<: *procgen-make-kwargs-defaults | |
distribution_mode: hard | |
algo_hyperparams: &procgen-hard-algo-defaults | |
<<: *procgen-algo-defaults | |
batch_size: 8192 | |
clip_range_decay: linear | |
learning_rate_decay: linear | |
eval_hyperparams: | |
<<: *procgen-eval-defaults | |
step_freq: !!float 5e5 | |
procgen-starpilot-hard: &procgen-starpilot-hard-defaults | |
<<: *procgen-hard-defaults | |
env_id: starpilot | |
procgen-starpilot-hard-2xIMPALA: | |
<<: *procgen-starpilot-hard-defaults | |
policy_hyperparams: | |
<<: *procgen-policy-defaults | |
impala_channels: [32, 64, 64] | |
algo_hyperparams: | |
<<: *procgen-hard-algo-defaults | |
learning_rate: !!float 3.3e-4 | |
procgen-starpilot-hard-2xIMPALA-fat: | |
<<: *procgen-starpilot-hard-defaults | |
policy_hyperparams: | |
<<: *procgen-policy-defaults | |
impala_channels: [32, 64, 64] | |
cnn_flatten_dim: 512 | |
algo_hyperparams: | |
<<: *procgen-hard-algo-defaults | |
learning_rate: !!float 2.5e-4 | |
procgen-starpilot-hard-4xIMPALA: | |
<<: *procgen-starpilot-hard-defaults | |
policy_hyperparams: | |
<<: *procgen-policy-defaults | |
impala_channels: [64, 128, 128] | |
algo_hyperparams: | |
<<: *procgen-hard-algo-defaults | |
learning_rate: !!float 2.1e-4 | |