# het_control_experiment.yaml
defaults:
- experiment_config
- _self_
# The device for collection (e.g. cuda)
sampling_device: "cuda"
# The device for training (e.g. cuda)
train_device: "cuda"
# Whether to share the parameters of the policy within agent groups
share_policy_params: True # This won't matter as our model ignores it
# If an algorithm and an env support both continuous and discrete actions, what should be preferred
prefer_continuous_actions: True
# Discount factor
gamma: 0.9
# Learning rate
lr: 0.00005
# Clips grad norm if true and clips grad value if false
clip_grad_norm: True
# The value for the clipping, if null no clipping
clip_grad_val: 5
# The epsilon parameter of the Adam optimizer
adam_eps: 0.00001
# Whether to use soft or hard target updates
soft_target_update: True
# If soft_target_update is True, this is its polyak_tau
polyak_tau: 0.005
# If soft_target_update is False, this is the frequency of the hard target updates in terms of n_optimizer_steps
hard_target_update_frequency: 5
# When an exploration wrapper is used, this is its initial epsilon for annealing
exploration_eps_init: 0.8
# When an exploration wrapper is used, this is its final epsilon after annealing
exploration_eps_end: 0.01
# Number of frames for annealing of exploration strategy in deterministic policy algorithms
# If null it will default to max_n_frames / 3
exploration_anneal_frames: 1_000_000
# The maximum number of experiment iterations before the experiment terminates, exclusive with max_n_frames
max_n_iters: null
# Number of collected frames before ending, exclusive with max_n_iters
max_n_frames: 10_000_000
# Number of frames collected at each experiment iteration
on_policy_collected_frames_per_batch: 60_000
# Number of environments used for collection
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially.
on_policy_n_envs_per_worker: 600
# This is the number of times collected_frames_per_batch will be split into minibatches and trained
on_policy_n_minibatch_iters: 45
# In on-policy algorithms the train_batch_size will be equal to the on_policy_collected_frames_per_batch
# and it will be split into minibatches with this number of frames for training
on_policy_minibatch_size: 4096
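# Illustration with the values above: the 60_000 collected frames split into
# 60_000 // 4096 = 14 minibatches, and this split is trained over 45 times (on_policy_n_minibatch_iters)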
# Number of frames collected at each experiment iteration
off_policy_collected_frames_per_batch: 6000
# Number of environments used for collection
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially.
off_policy_n_envs_per_worker: 60
# This is the number of times off_policy_train_batch_size will be sampled from the buffer and trained over.
off_policy_n_optimizer_steps: 1000
# Number of frames used for each off_policy_n_optimizer_steps when training off-policy algorithms
off_policy_train_batch_size: 128
# Maximum number of frames to keep in replay buffer memory for off-policy algorithms
off_policy_memory_size: 1_000_000
# Number of random action frames to prefill the replay buffer with
off_policy_init_random_frames: 0
# Whether to run evaluation episodes periodically
evaluation: True
# Whether to render the evaluation (if rendering is available)
render: True
# Frequency of evaluation in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch)
evaluation_interval: 120_000
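# Illustration: 120_000 is a multiple of both collection batch sizes above
# (2 x 60_000 on-policy, 20 x 6_000 off-policy), so this constraint holds in either setting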
# Number of episodes that evaluation is run on
evaluation_episodes: 200
# If True, when stochastic policies are evaluated, their mode is taken, otherwise, if False, they are sampled
evaluation_deterministic_actions: False
# Absolute path to the folder where the experiment will log.
# If null, this will default to the hydra output dir (if using hydra) or to the current folder when the script is run (if not).
save_folder: null
# Absolute path to a checkpoint file where the experiment was saved. If null the experiment is started fresh.
restore_file: null
# Interval for experiment saving in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch).
# Set it to 0 to disable checkpointing
checkpoint_interval: 0
# List of loggers to use, options are: wandb, csv, tensorboard, mlflow
loggers: [wandb]
# Create a json folder as part of the output in the format of marl-eval
create_json: True
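
# Usage sketch: one way to inspect these values before launching an experiment is to load
# the raw YAML with OmegaConf. This assumes the file is saved as "het_control_experiment.yaml"
# in the working directory; Hydra composition with the `experiment_config` base listed in
# `defaults` is not performed here, so only the keys defined in this file are available.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("het_control_experiment.yaml")
#   print(cfg.gamma, cfg.lr)  # 0.9 5e-05
#
#   # Approximate number of experiment iterations in the on-policy setting
#   # (pure arithmetic on the values above)
#   print(cfg.max_n_frames // cfg.on_policy_collected_frames_per_batch)  # 166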