Refactor random seed (#49)
* Refactor random seed
LucasAlegre authored Mar 30, 2023
1 parent 042f4c0 commit 0ebefc9
Showing 13 changed files with 65 additions and 81 deletions.
3 changes: 3 additions & 0 deletions benchmark/launch_experiment.py
@@ -16,6 +16,7 @@
import requests
from mo_gymnasium.utils import MORecordEpisodeStatistics

from morl_baselines.common.utils import seed_everything
from morl_baselines.multi_policy.envelope.envelope import Envelope
from morl_baselines.multi_policy.gpi_pd.gpi_pd import GPIPD
from morl_baselines.multi_policy.gpi_pd.gpi_pd_continuous_action import (
@@ -143,6 +144,8 @@ def main():
args = parse_args()
print(args)

seed_everything(args.seed)

if args.auto_tag:
if "WANDB_TAGS" in os.environ:
raise ValueError(
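The new call in main() relies on the seed_everything helper, whose body is not part of the visible hunks. As a hedged sketch only (assuming the usual convention of seeding Python's random module, NumPy's legacy global RNG, and PyTorch), such a helper looks roughly like:

import os
import random

import numpy as np
import torch as th


def seed_everything(seed: int):
    # Illustrative sketch only; the real body lives in morl_baselines/common/utils.py.
    random.seed(seed)                         # Python's built-in RNG
    np.random.seed(seed)                      # NumPy's global (legacy) RNG
    th.manual_seed(seed)                      # PyTorch CPU (and CUDA) RNGs
    os.environ["PYTHONHASHSEED"] = str(seed)  # reproducible hash randomization

Because these are process-wide globals, calling it once right after argument parsing, as the diff does, keeps the global state predictable while per-agent randomness moves to the new np_random generators introduced below.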
6 changes: 4 additions & 2 deletions morl_baselines/common/morl_algorithm.py
@@ -160,19 +160,21 @@ def update(self) -> None:
class MOAgent(ABC):
"""An MORL Agent, can contain one or multiple MOPolicies. Contains helpers to extract features from the environment, setup logging etc."""

def __init__(self, env: Optional[gym.Env], device: Union[th.device, str] = "auto") -> None:
def __init__(self, env: Optional[gym.Env], device: Union[th.device, str] = "auto", seed: Optional[int] = None) -> None:
"""Initializes the agent.
Args:
env: (gym.Env): The environment
device: (str): The device to use for training. Can be "auto", "cpu" or "cuda".
seed: (int): The seed to use for the random number generator
"""
self.extract_env_info(env)
self.device = th.device("cuda" if th.cuda.is_available() else "cpu") if device == "auto" else device

self.global_step = 0
self.num_episodes = 0
self.seed = None
self.seed = seed
self.np_random = np.random.default_rng(self.seed)

def extract_env_info(self, env: Optional[gym.Env]) -> None:
"""Extracts all the features of the environment: observation space, action space, ...
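The practical effect of the new seed argument is that each agent owns a NumPy Generator instead of leaning on np.random's global state. A small self-contained illustration of the construction used above:

import numpy as np

# Built the same way MOAgent.__init__ now builds self.np_random.
rng_a = np.random.default_rng(42)
rng_b = np.random.default_rng(42)

# Same seed -> identical, reproducible draws, independent of np.random's global state.
assert np.allclose(rng_a.random(5), rng_b.random(5))

# seed=None (the default) is still valid: the generator is then seeded from OS entropy.
rng_c = np.random.default_rng(None)
print(rng_c.random())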
16 changes: 10 additions & 6 deletions morl_baselines/common/utils.py
@@ -151,22 +151,23 @@ def equally_spaced_weights(dim: int, n: int, seed: int = 42) -> List[np.ndarray]
return list(get_reference_directions("energy", dim, n, seed=seed))


def random_weights(dim: int, seed: Optional[int] = None, n: int = 1, dist: str = "dirichlet") -> np.ndarray:
def random_weights(
dim: int, n: int = 1, dist: str = "dirichlet", seed: Optional[int] = None, rng: Optional[np.random.Generator] = None
) -> np.ndarray:
"""Generate random normalized weight vectors from a Gaussian or Dirichlet distribution alpha=1.
Args:
dim: size of the weight vector
seed: random seed
n : number of weight vectors to generate
dist: distribution to use, either 'gaussian' or 'dirichlet'. Default is 'dirichlet' as it is equivalent to sampling uniformly from the weight simplex.
seed: random seed
rng: random number generator
"""
if seed is not None:
if rng is None:
rng = np.random.default_rng(seed)
else:
rng = np.random

if dist == "gaussian":
w = np.random.randn(n, dim)
w = rng.standard_normal((n, dim))
w = np.abs(w) / np.linalg.norm(w, ord=1, axis=1, keepdims=True)
elif dist == "dirichlet":
w = rng.dirichlet(np.ones(dim), n)
@@ -319,6 +320,9 @@ def make_gif(env, agent, weight: np.ndarray, fullpath: str, fps: int = 50, lengt
def seed_everything(seed: int):
"""Set random seeds for reproducibility.
This function should be called only once per python process, preferably at the beginning of the main script.
It has global effects on the random state of the python process, so it should be used with care.
Args:
seed: random seed
"""
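With the reordered random_weights signature, a caller can pass either a seed or an existing Generator through the new rng argument, which is what the agents below do with self.np_random. A usage sketch (parameter values are arbitrary):

import numpy as np

from morl_baselines.common.utils import random_weights

rng = np.random.default_rng(0)

# Dirichlet(alpha=1) sampling: uniform over the weight simplex, driven by the shared generator.
w_dirichlet = random_weights(dim=3, n=5, dist="dirichlet", rng=rng)

# Gaussian variant: absolute values of normal draws, L1-normalized per row.
w_gaussian = random_weights(dim=3, n=5, dist="gaussian", rng=rng)

# Passing a seed (and no rng) builds a fresh default_rng(seed) internally.
w_seeded = random_weights(dim=3, seed=123)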
12 changes: 5 additions & 7 deletions morl_baselines/multi_policy/envelope/envelope.py
@@ -23,7 +23,6 @@
log_episode_info,
polyak_update,
random_weights,
seed_everything,
)


@@ -141,7 +140,7 @@ def __init__(
seed: The seed for the random number generator.
device: The device to use for training.
"""
MOAgent.__init__(self, env, device=device)
MOAgent.__init__(self, env, device=device, seed=seed)
MOPolicy.__init__(self, device)
self.learning_rate = learning_rate
self.initial_epsilon = initial_epsilon
@@ -191,9 +190,6 @@ def __init__(
action_dtype=np.uint8,
)

self.seed = seed
if self.seed is not None:
seed_everything(self.seed)
self.log = log
if log:
self.setup_wandb(project_name, experiment_name, wandb_entity)
@@ -282,7 +278,9 @@ def update(self):
) = self.__sample_batch_experiences()

sampled_w = (
th.tensor(random_weights(dim=self.reward_dim, n=self.num_sample_w, dist="gaussian")).float().to(self.device)
th.tensor(random_weights(dim=self.reward_dim, n=self.num_sample_w, dist="gaussian", rng=self.np_random))
.float()
.to(self.device)
) # sample num_sample_w random weights
w = sampled_w.repeat_interleave(b_obs.size(0), 0) # repeat the weights for each sample
b_obs, b_actions, b_rewards, b_next_obs, b_dones = (
@@ -376,7 +374,7 @@ def act(self, obs: th.Tensor, w: th.Tensor) -> int:
Returns: an integer representing the action to take.
"""
if np.random.random() < self.epsilon:
if self.np_random.random() < self.epsilon:
return self.env.action_space.sample()
else:
return self.max_action(obs, w)
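The recurring pattern of this commit is visible here: exploration noise and weight sampling are routed through the agent-owned self.np_random rather than NumPy's global functions. A minimal sketch of that epsilon-greedy step in isolation (a standalone illustration, not the Envelope class itself):

import numpy as np


def epsilon_greedy(rng: np.random.Generator, epsilon: float, n_actions: int, greedy_action: int) -> int:
    # Explore with probability epsilon, otherwise exploit the greedy action.
    if rng.random() < epsilon:
        return int(rng.integers(n_actions))
    return greedy_action


rng = np.random.default_rng(7)
actions = [epsilon_greedy(rng, epsilon=0.1, n_actions=4, greedy_action=2) for _ in range(10)]

In the agent itself the exploratory branch still calls self.env.action_space.sample(), whose randomness is governed by the Gymnasium space's own seeding.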
8 changes: 2 additions & 6 deletions morl_baselines/multi_policy/gpi_pd/gpi_pd.py
@@ -30,7 +30,6 @@
log_all_multi_policy_metrics,
log_episode_info,
polyak_update,
seed_everything,
unique_tol,
)
from morl_baselines.multi_policy.linear_support.linear_support import LinearSupport
@@ -172,7 +171,7 @@ def __init__(
seed: The seed for random number generators.
device: The device to use.
"""
MOAgent.__init__(self, env, device=device)
MOAgent.__init__(self, env, device=device, seed=seed)
MOPolicy.__init__(self, device)
self.learning_rate = learning_rate
self.initial_epsilon = initial_epsilon
@@ -262,9 +261,6 @@ def __init__(
self.dynamics_uncertainty_threshold = dynamics_uncertainty_threshold
self.real_ratio = real_ratio

self.seed = seed
if self.seed is not None:
seed_everything(self.seed)
# logging
self.log = log
if self.log:
@@ -566,7 +562,7 @@ def eval(self, obs: np.ndarray, w: np.ndarray) -> int:
return action

def _act(self, obs: th.Tensor, w: th.Tensor) -> int:
if np.random.random() < self.epsilon:
if self.np_random.random() < self.epsilon:
return self.env.action_space.sample()
else:
if self.use_gpi:
@@ -27,7 +27,6 @@
log_all_multi_policy_metrics,
log_episode_info,
polyak_update,
seed_everything,
unique_tol,
)
from morl_baselines.multi_policy.linear_support.linear_support import LinearSupport
@@ -161,7 +160,7 @@ def __init__(
seed (Optional[int], optional): The seed to use. Defaults to None.
device (Union[th.device, str], optional): The device to use for training. Defaults to "auto".
"""
MOAgent.__init__(self, env, device=device)
MOAgent.__init__(self, env, device=device, seed=seed)
MOPolicy.__init__(self, device)
self.learning_rate = learning_rate
self.tau = tau
@@ -241,10 +240,6 @@ def __init__(

self._n_updates = 0

self.seed = seed
if self.seed is not None:
seed_everything(self.seed)

self.log = log
if self.log:
self.setup_wandb(project_name, experiment_name, wandb_entity)
@@ -14,7 +14,6 @@
equally_spaced_weights,
log_all_multi_policy_metrics,
random_weights,
seed_everything,
)
from morl_baselines.multi_policy.linear_support.linear_support import LinearSupport
from morl_baselines.single_policy.ser.mo_q_learning import MOQLearning
@@ -69,7 +68,7 @@ def __init__(
seed: The seed to use for reproducibility.
log: Whether to log or not.
"""
MOAgent.__init__(self, env)
MOAgent.__init__(self, env, seed=seed)
# Learning
self.scalarization = scalarization
self.learning_rate = learning_rate
@@ -97,11 +96,6 @@ def __init__(
self.experiment_name = experiment_name
self.log = log

# Seed
self.seed = seed
if self.seed is not None:
seed_everything(self.seed)

if self.log:
self.setup_wandb(project_name=self.project_name, experiment_name=self.experiment_name, entity=wandb_entity)
else:
@@ -195,7 +189,7 @@ def train(
rep_eval=num_eval_episodes_for_front,
)
elif self.weight_selection_algo == "random":
w = random_weights(self.reward_dim)
w = random_weights(self.reward_dim, rng=self.np_random)

new_agent = MOQLearning(
env=self.env,
@@ -211,6 +205,7 @@
dyna_updates=self.dyna_updates,
log=self.log,
parent_writer=self.writer,
parent_rng=self.np_random,
seed=self.seed,
)
if self.transfer_q_table and len(self.policies) > 0:
@@ -231,13 +226,14 @@

value = policy_evaluation_mo(agent=new_agent, env=eval_env, w=w, rep=num_eval_episodes_for_front)[3]
removed_inds = self.linear_support.add_solution(value, w)
self.delete_policies(removed_inds)
if self.weight_selection_algo != "random":
self.delete_policies(removed_inds)

if self.log:
if self.use_gpi_policy:
front = [
policy_evaluation_mo(agent=self, env=eval_env, w=w, rep=num_eval_episodes_for_front)[3]
for w in eval_weights
policy_evaluation_mo(agent=self, env=eval_env, w=w_eval, rep=num_eval_episodes_for_front)[3]
for w_eval in eval_weights
]
else:
front = self.linear_support.ccs
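The new parent_rng=self.np_random argument shares the outer algorithm's generator with the MOQLearning workers it spawns. How the child consumes it is not shown in these hunks; a plausible sketch of the pattern (an assumption, using a hypothetical resolve_rng helper) is:

from typing import Optional

import numpy as np


def resolve_rng(parent_rng: Optional[np.random.Generator], seed: Optional[int]) -> np.random.Generator:
    # Hypothetical helper: reuse the parent's generator when provided, otherwise seed a fresh one.
    if parent_rng is not None:
        return parent_rng  # child draws advance the parent's stream, giving one reproducible sequence
    return np.random.default_rng(seed)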
13 changes: 4 additions & 9 deletions morl_baselines/multi_policy/pareto_q_learning/pql.py
@@ -10,7 +10,6 @@
from morl_baselines.common.utils import (
linearly_decaying_value,
log_all_multi_policy_metrics,
seed_everything,
)


@@ -50,7 +49,7 @@ def __init__(
wandb_entity: The wandb entity used for logging.
log: Whether to log or not.
"""
super().__init__(env)
super().__init__(env, seed=seed)
# Learning parameters
self.gamma = gamma
self.epsilon = initial_epsilon
@@ -59,10 +58,6 @@
self.final_epsilon = final_epsilon

# Algorithm setup
self.seed = seed
if self.seed is not None:
seed_everything(self.seed)
self.rng = np.random.default_rng(seed)
self.ref_point = ref_point

self.num_actions = self.env.action_space.n
@@ -159,11 +154,11 @@ def select_action(self, state: int, score_func: Callable):
Returns:
int: The selected action.
"""
if self.rng.uniform(0, 1) < self.epsilon:
return self.rng.integers(self.num_actions)
if self.np_random.uniform(0, 1) < self.epsilon:
return self.np_random.integers(self.num_actions)
else:
action_scores = score_func(state)
return self.rng.choice(np.argwhere(action_scores == np.max(action_scores)).flatten())
return self.np_random.choice(np.argwhere(action_scores == np.max(action_scores)).flatten())

def calc_non_dominated(self, state: int):
"""Get the non-dominated vectors in a given state.
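select_action above also shows the tie-breaking idiom now driven by the shared generator: among all actions whose score equals the maximum, one is picked at random. The same idiom in isolation, with made-up scores:

import numpy as np

rng = np.random.default_rng(3)
action_scores = np.array([0.2, 0.9, 0.9, 0.1])

# Indices of all maximal actions, then a uniform random pick among them.
best = np.argwhere(action_scores == np.max(action_scores)).flatten()
action = rng.choice(best)  # 1 or 2, chosen uniformly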
20 changes: 8 additions & 12 deletions morl_baselines/multi_policy/pcn/pcn.py
@@ -13,7 +13,7 @@
from morl_baselines.common.morl_algorithm import MOAgent, MOPolicy
from morl_baselines.common.pareto import get_non_dominated_inds
from morl_baselines.common.performance_indicators import hypervolume
from morl_baselines.common.utils import log_all_multi_policy_metrics, seed_everything
from morl_baselines.common.utils import log_all_multi_policy_metrics


def crowding_distance(points):
@@ -123,7 +123,7 @@ def __init__(
seed (Optional[int], optional): Seed for reproducibility. Defaults to None.
device (Union[th.device, str], optional): Device to use. Defaults to "auto".
"""
MOAgent.__init__(self, env, device=device)
MOAgent.__init__(self, env, device=device, seed=seed)
MOPolicy.__init__(self, device)

self.experience_replay = [] # List of (distance, time_step, transition)
@@ -140,10 +140,6 @@ def __init__(
).to(self.device)
self.opt = th.optim.Adam(self.model.parameters(), lr=self.learning_rate)

self.seed = seed
if self.seed is not None:
seed_everything(self.seed)

self.log = log
if log:
self.setup_wandb(project_name, experiment_name, wandb_entity)
@@ -164,13 +160,13 @@ def update(self):
"""Update PCN model."""
batch = []
# randomly choose episodes from experience buffer
s_i = np.random.choice(np.arange(len(self.experience_replay)), size=self.batch_size, replace=True)
s_i = self.np_random.choice(np.arange(len(self.experience_replay)), size=self.batch_size, replace=True)
for i in s_i:
# episode is tuple (return, transitions)
ep = self.experience_replay[i][2]
# choose random timestep from episode,
# use it's return and leftover timesteps as desired return and horizon
t = np.random.randint(0, len(ep))
t = self.np_random.integers(0, len(ep))
# reward contains return until end of episode
s_t, a_t, r_t, h_t = ep[t].observation, ep[t].action, np.float32(ep[t].reward), np.float32(len(ep) - t)
batch.append((s_t, a_t, r_t, h_t))
@@ -245,15 +241,15 @@ def _choose_commands(self, num_episodes: int):
returns = np.array(returns)[nd_i]
horizons = np.array(horizons)[nd_i]
# pick random return from random best episode
r_i = np.random.randint(0, len(returns))
r_i = self.np_random.integers(0, len(returns))
desired_horizon = np.float32(horizons[r_i] - 2)
# mean and std per objective
_, s = np.mean(returns, axis=0), np.std(returns, axis=0)
# desired return is sampled from [M, M+S], to try to do better than mean return
desired_return = returns[r_i].copy()
# random objective
r_i = np.random.randint(0, len(desired_return))
desired_return[r_i] += np.random.uniform(high=s[r_i])
r_i = self.np_random.integers(0, len(desired_return))
desired_return[r_i] += self.np_random.uniform(high=s[r_i])
desired_return = np.float32(desired_return)
return desired_return, desired_horizon

@@ -264,7 +260,7 @@ def _act(self, obs: np.ndarray, desired_return, desired_horizon) -> int:
th.tensor([desired_horizon]).unsqueeze(1).float().to(self.device),
)
log_probs = log_probs.detach().cpu().numpy()[0]
action = np.random.choice(np.arange(len(log_probs)), p=np.exp(log_probs))
action = self.np_random.choice(np.arange(len(log_probs)), p=np.exp(log_probs))
return action

def _run_episode(self, env, desired_return, desired_horizon, max_return):