From 0ebefc91cc78406856da18461cc16fb369e5995e Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Thu, 30 Mar 2023 10:49:47 +0200 Subject: [PATCH] Refactor random seed (#49) * Refactor random seed --- benchmark/launch_experiment.py | 3 +++ morl_baselines/common/morl_algorithm.py | 6 ++++-- morl_baselines/common/utils.py | 16 +++++++++------ .../multi_policy/envelope/envelope.py | 12 +++++------ morl_baselines/multi_policy/gpi_pd/gpi_pd.py | 8 ++------ .../gpi_pd/gpi_pd_continuous_action.py | 7 +------ .../mp_mo_q_learning.py | 18 +++++++---------- .../multi_policy/pareto_q_learning/pql.py | 13 ++++-------- morl_baselines/multi_policy/pcn/pcn.py | 20 ++++++++----------- morl_baselines/multi_policy/pgmorl/pgmorl.py | 12 ++++------- morl_baselines/single_policy/esr/eupg.py | 7 ++----- morl_baselines/single_policy/ser/mo_ppo.py | 8 +++++++- .../single_policy/ser/mo_q_learning.py | 16 +++++++-------- 13 files changed, 65 insertions(+), 81 deletions(-) diff --git a/benchmark/launch_experiment.py b/benchmark/launch_experiment.py index d0dd61b6..5674e7a8 100644 --- a/benchmark/launch_experiment.py +++ b/benchmark/launch_experiment.py @@ -16,6 +16,7 @@ import requests from mo_gymnasium.utils import MORecordEpisodeStatistics +from morl_baselines.common.utils import seed_everything from morl_baselines.multi_policy.envelope.envelope import Envelope from morl_baselines.multi_policy.gpi_pd.gpi_pd import GPIPD from morl_baselines.multi_policy.gpi_pd.gpi_pd_continuous_action import ( @@ -143,6 +144,8 @@ def main(): args = parse_args() print(args) + seed_everything(args.seed) + if args.auto_tag: if "WANDB_TAGS" in os.environ: raise ValueError( diff --git a/morl_baselines/common/morl_algorithm.py b/morl_baselines/common/morl_algorithm.py index df612031..00eed868 100644 --- a/morl_baselines/common/morl_algorithm.py +++ b/morl_baselines/common/morl_algorithm.py @@ -160,19 +160,21 @@ def update(self) -> None: class MOAgent(ABC): """An MORL Agent, can contain one or multiple MOPolicies. Contains helpers to extract features from the environment, setup logging etc.""" - def __init__(self, env: Optional[gym.Env], device: Union[th.device, str] = "auto") -> None: + def __init__(self, env: Optional[gym.Env], device: Union[th.device, str] = "auto", seed: Optional[int] = None) -> None: """Initializes the agent. Args: env: (gym.Env): The environment device: (str): The device to use for training. Can be "auto", "cpu" or "cuda". + seed: (int): The seed to use for the random number generator """ self.extract_env_info(env) self.device = th.device("cuda" if th.cuda.is_available() else "cpu") if device == "auto" else device self.global_step = 0 self.num_episodes = 0 - self.seed = None + self.seed = seed + self.np_random = np.random.default_rng(self.seed) def extract_env_info(self, env: Optional[gym.Env]) -> None: """Extracts all the features of the environment: observation space, action space, ... diff --git a/morl_baselines/common/utils.py b/morl_baselines/common/utils.py index 10299715..f1ebc085 100644 --- a/morl_baselines/common/utils.py +++ b/morl_baselines/common/utils.py @@ -151,22 +151,23 @@ def equally_spaced_weights(dim: int, n: int, seed: int = 42) -> List[np.ndarray] return list(get_reference_directions("energy", dim, n, seed=seed)) -def random_weights(dim: int, seed: Optional[int] = None, n: int = 1, dist: str = "dirichlet") -> np.ndarray: +def random_weights( + dim: int, n: int = 1, dist: str = "dirichlet", seed: Optional[int] = None, rng: Optional[np.random.Generator] = None +) -> np.ndarray: """Generate random normalized weight vectors from a Gaussian or Dirichlet distribution alpha=1. Args: dim: size of the weight vector - seed: random seed n : number of weight vectors to generate dist: distribution to use, either 'gaussian' or 'dirichlet'. Default is 'dirichlet' as it is equivalent to sampling uniformly from the weight simplex. + seed: random seed + rng: random number generator """ - if seed is not None: + if rng is None: rng = np.random.default_rng(seed) - else: - rng = np.random if dist == "gaussian": - w = np.random.randn(n, dim) + w = rng.standard_normal((n, dim)) w = np.abs(w) / np.linalg.norm(w, ord=1, axis=1, keepdims=True) elif dist == "dirichlet": w = rng.dirichlet(np.ones(dim), n) @@ -319,6 +320,9 @@ def make_gif(env, agent, weight: np.ndarray, fullpath: str, fps: int = 50, lengt def seed_everything(seed: int): """Set random seeds for reproducibility. + This function should be called only once per python process, preferably at the beginning of the main script. + It has global effects on the random state of the python process, so it should be used with care. + Args: seed: random seed """ diff --git a/morl_baselines/multi_policy/envelope/envelope.py b/morl_baselines/multi_policy/envelope/envelope.py index 84f9b361..784ef350 100644 --- a/morl_baselines/multi_policy/envelope/envelope.py +++ b/morl_baselines/multi_policy/envelope/envelope.py @@ -23,7 +23,6 @@ log_episode_info, polyak_update, random_weights, - seed_everything, ) @@ -141,7 +140,7 @@ def __init__( seed: The seed for the random number generator. device: The device to use for training. """ - MOAgent.__init__(self, env, device=device) + MOAgent.__init__(self, env, device=device, seed=seed) MOPolicy.__init__(self, device) self.learning_rate = learning_rate self.initial_epsilon = initial_epsilon @@ -191,9 +190,6 @@ def __init__( action_dtype=np.uint8, ) - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) self.log = log if log: self.setup_wandb(project_name, experiment_name, wandb_entity) @@ -282,7 +278,9 @@ def update(self): ) = self.__sample_batch_experiences() sampled_w = ( - th.tensor(random_weights(dim=self.reward_dim, n=self.num_sample_w, dist="gaussian")).float().to(self.device) + th.tensor(random_weights(dim=self.reward_dim, n=self.num_sample_w, dist="gaussian", rng=self.np_random)) + .float() + .to(self.device) ) # sample num_sample_w random weights w = sampled_w.repeat_interleave(b_obs.size(0), 0) # repeat the weights for each sample b_obs, b_actions, b_rewards, b_next_obs, b_dones = ( @@ -376,7 +374,7 @@ def act(self, obs: th.Tensor, w: th.Tensor) -> int: Returns: an integer representing the action to take. """ - if np.random.random() < self.epsilon: + if self.np_random.random() < self.epsilon: return self.env.action_space.sample() else: return self.max_action(obs, w) diff --git a/morl_baselines/multi_policy/gpi_pd/gpi_pd.py b/morl_baselines/multi_policy/gpi_pd/gpi_pd.py index dacd63f9..9b24cc66 100644 --- a/morl_baselines/multi_policy/gpi_pd/gpi_pd.py +++ b/morl_baselines/multi_policy/gpi_pd/gpi_pd.py @@ -30,7 +30,6 @@ log_all_multi_policy_metrics, log_episode_info, polyak_update, - seed_everything, unique_tol, ) from morl_baselines.multi_policy.linear_support.linear_support import LinearSupport @@ -172,7 +171,7 @@ def __init__( seed: The seed for random number generators. device: The device to use. """ - MOAgent.__init__(self, env, device=device) + MOAgent.__init__(self, env, device=device, seed=seed) MOPolicy.__init__(self, device) self.learning_rate = learning_rate self.initial_epsilon = initial_epsilon @@ -262,9 +261,6 @@ def __init__( self.dynamics_uncertainty_threshold = dynamics_uncertainty_threshold self.real_ratio = real_ratio - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) # logging self.log = log if self.log: @@ -566,7 +562,7 @@ def eval(self, obs: np.ndarray, w: np.ndarray) -> int: return action def _act(self, obs: th.Tensor, w: th.Tensor) -> int: - if np.random.random() < self.epsilon: + if self.np_random.random() < self.epsilon: return self.env.action_space.sample() else: if self.use_gpi: diff --git a/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py b/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py index b95d526d..f6b3b3ef 100644 --- a/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py +++ b/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py @@ -27,7 +27,6 @@ log_all_multi_policy_metrics, log_episode_info, polyak_update, - seed_everything, unique_tol, ) from morl_baselines.multi_policy.linear_support.linear_support import LinearSupport @@ -161,7 +160,7 @@ def __init__( seed (Optional[int], optional): The seed to use. Defaults to None. device (Union[th.device, str], optional): The device to use for training. Defaults to "auto". """ - MOAgent.__init__(self, env, device=device) + MOAgent.__init__(self, env, device=device, seed=seed) MOPolicy.__init__(self, device) self.learning_rate = learning_rate self.tau = tau @@ -241,10 +240,6 @@ def __init__( self._n_updates = 0 - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) - self.log = log if self.log: self.setup_wandb(project_name, experiment_name, wandb_entity) diff --git a/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py b/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py index f78c8451..f60159e1 100644 --- a/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py +++ b/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py @@ -14,7 +14,6 @@ equally_spaced_weights, log_all_multi_policy_metrics, random_weights, - seed_everything, ) from morl_baselines.multi_policy.linear_support.linear_support import LinearSupport from morl_baselines.single_policy.ser.mo_q_learning import MOQLearning @@ -69,7 +68,7 @@ def __init__( seed: The seed to use for reproducibility. log: Whether to log or not. """ - MOAgent.__init__(self, env) + MOAgent.__init__(self, env, seed=seed) # Learning self.scalarization = scalarization self.learning_rate = learning_rate @@ -97,11 +96,6 @@ def __init__( self.experiment_name = experiment_name self.log = log - # Seed - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) - if self.log: self.setup_wandb(project_name=self.project_name, experiment_name=self.experiment_name, entity=wandb_entity) else: @@ -195,7 +189,7 @@ def train( rep_eval=num_eval_episodes_for_front, ) elif self.weight_selection_algo == "random": - w = random_weights(self.reward_dim) + w = random_weights(self.reward_dim, rng=self.np_random) new_agent = MOQLearning( env=self.env, @@ -211,6 +205,7 @@ def train( dyna_updates=self.dyna_updates, log=self.log, parent_writer=self.writer, + parent_rng=self.np_random, seed=self.seed, ) if self.transfer_q_table and len(self.policies) > 0: @@ -231,13 +226,14 @@ def train( value = policy_evaluation_mo(agent=new_agent, env=eval_env, w=w, rep=num_eval_episodes_for_front)[3] removed_inds = self.linear_support.add_solution(value, w) - self.delete_policies(removed_inds) + if self.weight_selection_algo != "random": + self.delete_policies(removed_inds) if self.log: if self.use_gpi_policy: front = [ - policy_evaluation_mo(agent=self, env=eval_env, w=w, rep=num_eval_episodes_for_front)[3] - for w in eval_weights + policy_evaluation_mo(agent=self, env=eval_env, w=w_eval, rep=num_eval_episodes_for_front)[3] + for w_eval in eval_weights ] else: front = self.linear_support.ccs diff --git a/morl_baselines/multi_policy/pareto_q_learning/pql.py b/morl_baselines/multi_policy/pareto_q_learning/pql.py index 1580d5c2..90db1c08 100644 --- a/morl_baselines/multi_policy/pareto_q_learning/pql.py +++ b/morl_baselines/multi_policy/pareto_q_learning/pql.py @@ -10,7 +10,6 @@ from morl_baselines.common.utils import ( linearly_decaying_value, log_all_multi_policy_metrics, - seed_everything, ) @@ -50,7 +49,7 @@ def __init__( wandb_entity: The wandb entity used for logging. log: Whether to log or not. """ - super().__init__(env) + super().__init__(env, seed=seed) # Learning parameters self.gamma = gamma self.epsilon = initial_epsilon @@ -59,10 +58,6 @@ def __init__( self.final_epsilon = final_epsilon # Algorithm setup - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) - self.rng = np.random.default_rng(seed) self.ref_point = ref_point self.num_actions = self.env.action_space.n @@ -159,11 +154,11 @@ def select_action(self, state: int, score_func: Callable): Returns: int: The selected action. """ - if self.rng.uniform(0, 1) < self.epsilon: - return self.rng.integers(self.num_actions) + if self.np_random.uniform(0, 1) < self.epsilon: + return self.np_random.integers(self.num_actions) else: action_scores = score_func(state) - return self.rng.choice(np.argwhere(action_scores == np.max(action_scores)).flatten()) + return self.np_random.choice(np.argwhere(action_scores == np.max(action_scores)).flatten()) def calc_non_dominated(self, state: int): """Get the non-dominated vectors in a given state. diff --git a/morl_baselines/multi_policy/pcn/pcn.py b/morl_baselines/multi_policy/pcn/pcn.py index 3030c0ac..3be79dc7 100644 --- a/morl_baselines/multi_policy/pcn/pcn.py +++ b/morl_baselines/multi_policy/pcn/pcn.py @@ -13,7 +13,7 @@ from morl_baselines.common.morl_algorithm import MOAgent, MOPolicy from morl_baselines.common.pareto import get_non_dominated_inds from morl_baselines.common.performance_indicators import hypervolume -from morl_baselines.common.utils import log_all_multi_policy_metrics, seed_everything +from morl_baselines.common.utils import log_all_multi_policy_metrics def crowding_distance(points): @@ -123,7 +123,7 @@ def __init__( seed (Optional[int], optional): Seed for reproducibility. Defaults to None. device (Union[th.device, str], optional): Device to use. Defaults to "auto". """ - MOAgent.__init__(self, env, device=device) + MOAgent.__init__(self, env, device=device, seed=seed) MOPolicy.__init__(self, device) self.experience_replay = [] # List of (distance, time_step, transition) @@ -140,10 +140,6 @@ def __init__( ).to(self.device) self.opt = th.optim.Adam(self.model.parameters(), lr=self.learning_rate) - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) - self.log = log if log: self.setup_wandb(project_name, experiment_name, wandb_entity) @@ -164,13 +160,13 @@ def update(self): """Update PCN model.""" batch = [] # randomly choose episodes from experience buffer - s_i = np.random.choice(np.arange(len(self.experience_replay)), size=self.batch_size, replace=True) + s_i = self.np_random.choice(np.arange(len(self.experience_replay)), size=self.batch_size, replace=True) for i in s_i: # episode is tuple (return, transitions) ep = self.experience_replay[i][2] # choose random timestep from episode, # use it's return and leftover timesteps as desired return and horizon - t = np.random.randint(0, len(ep)) + t = self.np_random.integers(0, len(ep)) # reward contains return until end of episode s_t, a_t, r_t, h_t = ep[t].observation, ep[t].action, np.float32(ep[t].reward), np.float32(len(ep) - t) batch.append((s_t, a_t, r_t, h_t)) @@ -245,15 +241,15 @@ def _choose_commands(self, num_episodes: int): returns = np.array(returns)[nd_i] horizons = np.array(horizons)[nd_i] # pick random return from random best episode - r_i = np.random.randint(0, len(returns)) + r_i = self.np_random.integers(0, len(returns)) desired_horizon = np.float32(horizons[r_i] - 2) # mean and std per objective _, s = np.mean(returns, axis=0), np.std(returns, axis=0) # desired return is sampled from [M, M+S], to try to do better than mean return desired_return = returns[r_i].copy() # random objective - r_i = np.random.randint(0, len(desired_return)) - desired_return[r_i] += np.random.uniform(high=s[r_i]) + r_i = self.np_random.integers(0, len(desired_return)) + desired_return[r_i] += self.np_random.uniform(high=s[r_i]) desired_return = np.float32(desired_return) return desired_return, desired_horizon @@ -264,7 +260,7 @@ def _act(self, obs: np.ndarray, desired_return, desired_horizon) -> int: th.tensor([desired_horizon]).unsqueeze(1).float().to(self.device), ) log_probs = log_probs.detach().cpu().numpy()[0] - action = np.random.choice(np.arange(len(log_probs)), p=np.exp(log_probs)) + action = self.np_random.choice(np.arange(len(log_probs)), p=np.exp(log_probs)) return action def _run_episode(self, env, desired_return, desired_horizon, max_return): diff --git a/morl_baselines/multi_policy/pgmorl/pgmorl.py b/morl_baselines/multi_policy/pgmorl/pgmorl.py index 427b024f..d0fd5b3e 100644 --- a/morl_baselines/multi_policy/pgmorl/pgmorl.py +++ b/morl_baselines/multi_policy/pgmorl/pgmorl.py @@ -18,7 +18,7 @@ from morl_baselines.common.morl_algorithm import MOAgent from morl_baselines.common.pareto import ParetoArchive from morl_baselines.common.performance_indicators import hypervolume, sparsity -from morl_baselines.common.utils import log_all_multi_policy_metrics, seed_everything +from morl_baselines.common.utils import log_all_multi_policy_metrics from morl_baselines.single_policy.ser.mo_ppo import MOPPO, MOPPONet, make_env @@ -364,7 +364,7 @@ def __init__( gae_lambda: lambda parameter for GAE device: device on which the code should run """ - super().__init__(env, device=device) + super().__init__(env, device=device, seed=seed) # Env dimensions self.tmp_env = mo_gym.make(env_id) self.extract_env_info(self.tmp_env) @@ -411,11 +411,6 @@ def __init__( self.gae_lambda = gae_lambda self.gae = gae - # seeding - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) - # env setup if env is None: if self.seed is not None: @@ -470,6 +465,7 @@ def __init__( target_kl=self.target_kl, gae=self.gae, gae_lambda=self.gae_lambda, + rng=self.np_random, ) for i in range(self.pop_size) ] @@ -547,7 +543,7 @@ def __eval_all_agents( def __task_weight_selection(self, ref_point: np.ndarray): """Chooses agents and weights to train at the next iteration based on the current population and prediction model.""" candidate_weights = generate_weights(self.delta_weight / 2.0) # Generates more weights than agents - np.random.shuffle(candidate_weights) # Randomize + self.np_random.shuffle(candidate_weights) # Randomize current_front = deepcopy(self.archive.evaluations) population = self.population.individuals diff --git a/morl_baselines/single_policy/esr/eupg.py b/morl_baselines/single_policy/esr/eupg.py index b10c7883..03f3c0a1 100644 --- a/morl_baselines/single_policy/esr/eupg.py +++ b/morl_baselines/single_policy/esr/eupg.py @@ -12,7 +12,7 @@ from morl_baselines.common.accrued_reward_buffer import AccruedRewardReplayBuffer from morl_baselines.common.morl_algorithm import MOAgent, MOPolicy from morl_baselines.common.networks import mlp -from morl_baselines.common.utils import layer_init, log_episode_info, seed_everything +from morl_baselines.common.utils import layer_init, log_episode_info class PolicyNet(nn.Module): @@ -109,16 +109,13 @@ def __init__( device: Device to use for NN. Can be "cpu", "cuda" or "auto". seed: Seed for the random number generator """ - MOAgent.__init__(self, env, device) + MOAgent.__init__(self, env, device, seed=seed) MOPolicy.__init__(self, None, device) self.env = env # RL self.scalarization = scalarization self.gamma = gamma - self.seed = seed - if self.seed is not None: - seed_everything(self.seed) # Learning self.buffer_size = buffer_size diff --git a/morl_baselines/single_policy/ser/mo_ppo.py b/morl_baselines/single_policy/ser/mo_ppo.py index d9ac7dd9..25cc2caf 100644 --- a/morl_baselines/single_policy/ser/mo_ppo.py +++ b/morl_baselines/single_policy/ser/mo_ppo.py @@ -261,6 +261,7 @@ def __init__( gae_lambda: float = 0.95, device: Union[th.device, str] = "auto", seed: int = 42, + rng: Optional[np.random.Generator] = None, ): """Multi-objective PPO. @@ -287,6 +288,7 @@ def __init__( gae_lambda: GAE lambda device: Device to use seed: Random seed + rng: Random number generator """ super().__init__(id, device) self.id = id @@ -295,6 +297,10 @@ def __init__( self.networks = networks self.device = device self.seed = seed + if rng is not None: + self.np_random = rng + else: + self.np_random = np.random.default_rng(self.seed) # PPO Parameters self.steps_per_iteration = steps_per_iteration @@ -492,7 +498,7 @@ def update(self): clipfracs = [] # Perform multiple passes on the batch (that is shuffled every time) for epoch in range(self.update_epochs): - np.random.shuffle(b_inds) + self.np_random.shuffle(b_inds) for start in range(0, self.batch_size, self.minibatch_size): end = start + self.minibatch_size # mb == minibatch diff --git a/morl_baselines/single_policy/ser/mo_q_learning.py b/morl_baselines/single_policy/ser/mo_q_learning.py index 345e58db..f96d53fd 100644 --- a/morl_baselines/single_policy/ser/mo_q_learning.py +++ b/morl_baselines/single_policy/ser/mo_q_learning.py @@ -10,11 +10,7 @@ from morl_baselines.common.model_based.tabular_model import TabularModel from morl_baselines.common.morl_algorithm import MOAgent, MOPolicy from morl_baselines.common.scalarization import weighted_sum -from morl_baselines.common.utils import ( - linearly_decaying_value, - log_episode_info, - seed_everything, -) +from morl_baselines.common.utils import linearly_decaying_value, log_episode_info class MOQLearning(MOPolicy, MOAgent): @@ -44,6 +40,7 @@ def __init__( log: bool = True, seed: Optional[int] = None, parent_writer: Optional[SummaryWriter] = None, + parent_rng: Optional[np.random.Generator] = None, ): """Initializes the MOQ-learning algorithm. @@ -66,14 +63,17 @@ def __init__( log: Whether to log or not seed: The seed to use for the experiment parent_writer: The writer to use for logging. If None, a new writer is created. + parent_rng: The random number generator to use. If None, a new one is created. """ MOAgent.__init__(self, env) MOPolicy.__init__(self, id) self.learning_rate = learning_rate self.id = id self.seed = seed - if self.seed is not None: - seed_everything(self.seed) + if parent_rng is not None: + self.np_random = parent_rng + else: + self.np_random = np.random.default_rng(self.seed) if self.id is not None: self.idstr = f"_{self.id}" @@ -104,7 +104,7 @@ def __init__( def __act(self, obs: np.array): # epsilon-greedy - coin = np.random.rand() + coin = self.np_random.random() if coin < self.epsilon: return int(self.env.action_space.sample()) else: