diff --git a/.github/workflows/build-publish.yml b/.github/workflows/build-publish.yml index cba96336..c944eeee 100644 --- a/.github/workflows/build-publish.yml +++ b/.github/workflows/build-publish.yml @@ -4,7 +4,7 @@ # - https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ # # derived from https://github.com/Farama-Foundation/PettingZoo/blob/e230f4d80a5df3baf9bd905149f6d4e8ce22be31/.github/workflows/build-publish.yml -name: build-publish +name: Build artifact for PyPI on: push: @@ -16,35 +16,18 @@ on: jobs: build-wheels: - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: ubuntu-latest - python: 38 - platform: manylinux_x86_64 - - os: ubuntu-latest - python: 39 - platform: manylinux_x86_64 - - os: ubuntu-latest - python: 310 - platform: manylinux_x86_64 - - os: ubuntu-latest - python: 311 - platform: manylinux_x86_64 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - name: Install dependencies - run: python -m pip install --upgrade pip setuptools build + run: pipx install build - name: Build sdist and wheels - run: python -m build + run: pyproject-build - name: Store wheels - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: path: dist @@ -55,10 +38,11 @@ jobs: if: github.event_name == 'release' && github.event.action == 'published' steps: - name: Download dists - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: artifact path: dist + - name: Publish uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 80ce02af..9f2cc2ab 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -13,9 +13,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - - run: python -m pip install pre-commit - - run: python -m pre_commit --version - - run: python -m pre_commit install - - run: python -m pre_commit run --all-files + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - run: pipx install pre-commit + - run: pre-commit run --all-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 69680e4e..af334549 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v5.0.0 hooks: - id: check-symlinks - id: destroyed-symlinks @@ -18,13 +18,13 @@ repos: - id: detect-private-key - id: debug-statements - repo: https://github.com/codespell-project/codespell - rev: v2.2.4 + rev: v2.3.0 hooks: - id: codespell args: - --ignore-words-list=reacher,ure,referenc,wile,mor,ser,esr,nowe - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 args: @@ -35,16 +35,16 @@ repos: - --show-source - --statistics - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 + rev: v3.18.0 hooks: - id: pyupgrade args: ["--py37-plus"] - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/python/black - rev: 23.1.0 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/pycqa/pydocstyle diff --git a/examples/envelope_minecart.py b/examples/envelope_minecart.py index 
bf41f0f6..c3f43bf4 100644 --- a/examples/envelope_minecart.py +++ b/examples/envelope_minecart.py @@ -1,6 +1,6 @@ import mo_gymnasium as mo_gym import numpy as np -from mo_gymnasium.utils import MORecordEpisodeStatistics +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from morl_baselines.multi_policy.envelope.envelope import Envelope diff --git a/examples/eupg_fishwood.py b/examples/eupg_fishwood.py index 7b253522..55a77583 100644 --- a/examples/eupg_fishwood.py +++ b/examples/eupg_fishwood.py @@ -1,7 +1,7 @@ import mo_gymnasium as mo_gym import numpy as np import torch as th -from mo_gymnasium.utils import MORecordEpisodeStatistics +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from morl_baselines.common.evaluation import eval_mo_reward_conditioned from morl_baselines.single_policy.esr.eupg import EUPG diff --git a/examples/mo_q_learning_DST.py b/examples/mo_q_learning_DST.py index ab234086..8910519e 100644 --- a/examples/mo_q_learning_DST.py +++ b/examples/mo_q_learning_DST.py @@ -2,7 +2,7 @@ import mo_gymnasium as mo_gym import numpy as np -from mo_gymnasium.utils import MORecordEpisodeStatistics +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from morl_baselines.common.evaluation import eval_mo from morl_baselines.common.scalarization import tchebicheff diff --git a/examples/mp_mo_q_learning_DST.py b/examples/mp_mo_q_learning_DST.py index 89977c3f..a6d418b6 100644 --- a/examples/mp_mo_q_learning_DST.py +++ b/examples/mp_mo_q_learning_DST.py @@ -1,6 +1,6 @@ import mo_gymnasium as mo_gym import numpy as np -from mo_gymnasium import MORecordEpisodeStatistics +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from morl_baselines.common.scalarization import tchebicheff from morl_baselines.multi_policy.multi_policy_moqlearning.mp_mo_q_learning import ( diff --git a/examples/pcn_minecart.py b/examples/pcn_minecart.py index aabc577f..363fcd5f 100644 --- a/examples/pcn_minecart.py +++ b/examples/pcn_minecart.py @@ -1,6 +1,6 @@ import mo_gymnasium as mo_gym import numpy as np -from mo_gymnasium.utils import MORecordEpisodeStatistics +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from morl_baselines.multi_policy.pcn.pcn import PCN diff --git a/examples/pgmorl_halfcheetah.py b/examples/pgmorl_halfcheetah.py index 5b54ed5b..2b5bd5f0 100644 --- a/examples/pgmorl_halfcheetah.py +++ b/examples/pgmorl_halfcheetah.py @@ -19,7 +19,7 @@ algo.train( total_timesteps=int(5e6), eval_env=make_env(env_id, 42, 0, "PGMORL_eval_env", gamma=0.995)(), - ref_point=np.array([0.0, -5.0]), + ref_point=np.array([-100.0, -100.0]), known_pareto_front=None, ) env = make_env(env_id, 422, 1, "PGMORL_test", gamma=0.995)() # idx != 0 to avoid taking videos diff --git a/experiments/benchmark/launch_experiment.py b/experiments/benchmark/launch_experiment.py index 4b093b20..cf1a1b6d 100644 --- a/experiments/benchmark/launch_experiment.py +++ b/experiments/benchmark/launch_experiment.py @@ -15,9 +15,8 @@ import numpy as np import requests from gym_super_mario_bros.actions import SIMPLE_MOVEMENT -from gymnasium.wrappers import FlattenObservation -from gymnasium.wrappers.record_video import RecordVideo -from mo_gymnasium.utils import MORecordEpisodeStatistics +from gymnasium.wrappers import FlattenObservation, RecordVideo +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from morl_baselines.common.evaluation import seed_everything from morl_baselines.common.experiments import ( @@ -90,13 +89,15 @@ def autotag() -> str: git_commit = 
subprocess.check_output(["git", "rev-parse", "--verify", "HEAD"]).decode("ascii").strip() try: # try finding the pull request number on github - prs = requests.get(f"https://api.github.com/search/issues?q=repo:LucasAlegre/morl-baselines+is:pr+{git_commit}") + prs = requests.get( + f"https://api.github.com/search/issues?q=repo:LucasAlegre/morl-baselines+is:pr+{git_commit}" # noqa + ) if prs.status_code == 200: prs = prs.json() if len(prs["items"]) > 0: pr = prs["items"][0] pr_number = pr["number"] - wandb_tag += f",pr-{pr_number}" + wandb_tag += f",pr-{pr_number}" # noqa print(f"identified github pull request: {pr_number}") except Exception as e: print(e) @@ -165,7 +166,7 @@ def wrap_mario(env): TimeLimit, ) from mo_gymnasium.envs.mario.joypad_space import JoypadSpace - from mo_gymnasium.utils import MOMaxAndSkipObservation + from mo_gymnasium.wrappers import MOMaxAndSkipObservation env = JoypadSpace(env, SIMPLE_MOVEMENT) env = MOMaxAndSkipObservation(env, skip=4) diff --git a/experiments/hyperparameter_search/launch_sweep.py b/experiments/hyperparameter_search/launch_sweep.py index de48d782..e6fedb44 100644 --- a/experiments/hyperparameter_search/launch_sweep.py +++ b/experiments/hyperparameter_search/launch_sweep.py @@ -7,7 +7,7 @@ import numpy as np import wandb import yaml -from mo_gymnasium.utils import MORecordEpisodeStatistics +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from morl_baselines.common.evaluation import seed_everything from morl_baselines.common.experiments import ( diff --git a/morl_baselines/__init__.py b/morl_baselines/__init__.py index 498632ab..d2af2963 100644 --- a/morl_baselines/__init__.py +++ b/morl_baselines/__init__.py @@ -1,4 +1,3 @@ """MORL-Baselines contains various MORL algorithms and utility functions.""" - -__version__ = "1.0.0" +__version__ = "1.1.0" diff --git a/morl_baselines/common/buffer.py b/morl_baselines/common/buffer.py index 58b53397..ee3722b7 100644 --- a/morl_baselines/common/buffer.py +++ b/morl_baselines/common/buffer.py @@ -1,4 +1,5 @@ """Replay buffer for multi-objective reinforcement learning.""" + import numpy as np import torch as th diff --git a/morl_baselines/common/diverse_buffer.py b/morl_baselines/common/diverse_buffer.py index a0d19c85..e49330fb 100644 --- a/morl_baselines/common/diverse_buffer.py +++ b/morl_baselines/common/diverse_buffer.py @@ -1,4 +1,5 @@ """Diverse Experience Replay Buffer. 
Code extracted from https://github.com/axelabels/DynMORL.""" + from dataclasses import dataclass import numpy as np @@ -154,7 +155,7 @@ def update(self, idx: int, p, tree_id=None): Keyword Arguments: tree_id {object} -- Tree to be updated (default: {None}) """ - if type(p) == dict: + if isinstance(p, dict): for k in p: self.update(idx, p[k], k) return @@ -476,7 +477,10 @@ def get_data(self, include_indices: bool = False): Returns: The data """ - all_data = list(np.arange(self.capacity) + self.capacity - 1), list(self.tree.data) + all_data = ( + list(np.arange(self.capacity) + self.capacity - 1), + list(self.tree.data), + ) indices = [] data = [] for i, d in zip(all_data[0], all_data[1]): diff --git a/morl_baselines/common/evaluation.py b/morl_baselines/common/evaluation.py index 79b6eff3..af106c08 100644 --- a/morl_baselines/common/evaluation.py +++ b/morl_baselines/common/evaluation.py @@ -1,4 +1,5 @@ """Utilities related to evaluation.""" + import os import random from typing import List, Optional, Tuple diff --git a/morl_baselines/common/experiments.py b/morl_baselines/common/experiments.py index dff6237c..71dda225 100644 --- a/morl_baselines/common/experiments.py +++ b/morl_baselines/common/experiments.py @@ -1,4 +1,5 @@ """Common experiment utilities.""" + import argparse from morl_baselines.multi_policy.capql.capql import CAPQL diff --git a/morl_baselines/common/model_based/probabilistic_ensemble.py b/morl_baselines/common/model_based/probabilistic_ensemble.py index db40b5f7..f5333468 100644 --- a/morl_baselines/common/model_based/probabilistic_ensemble.py +++ b/morl_baselines/common/model_based/probabilistic_ensemble.py @@ -1,4 +1,5 @@ """Probabilistic ensemble of neural networks.""" + import os import numpy as np diff --git a/morl_baselines/common/model_based/tabular_model.py b/morl_baselines/common/model_based/tabular_model.py index 96664800..903429e7 100644 --- a/morl_baselines/common/model_based/tabular_model.py +++ b/morl_baselines/common/model_based/tabular_model.py @@ -1,4 +1,5 @@ """Tabular dynamics model S_{t+1}, R_t ~ m(.,.|s,a) .""" + import random import numpy as np diff --git a/morl_baselines/common/model_based/utils.py b/morl_baselines/common/model_based/utils.py index 10d58467..1d13fa0d 100644 --- a/morl_baselines/common/model_based/utils.py +++ b/morl_baselines/common/model_based/utils.py @@ -1,4 +1,5 @@ """Utility functions for the model.""" + from typing import Tuple import matplotlib.pyplot as plt @@ -34,7 +35,7 @@ def termination_fn_dst(obs, act, next_obs): def termination_fn_mountaincar(obs, act, next_obs): - """Termination function of mountin car.""" + """Termination function of mountain car.""" assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 position = next_obs[:, 0] velocity = next_obs[:, 1] @@ -147,16 +148,29 @@ def step( var_obs = var_obs[0] var_rewards = var_rewards[0] - info = {"uncertainty": uncertainties, "var_obs": var_obs, "var_rewards": var_rewards} + info = { + "uncertainty": uncertainties, + "var_obs": var_obs, + "var_rewards": var_rewards, + } # info = {'mean': return_means, 'std': return_stds, 'log_prob': log_prob, 'dev': dev} return next_obs, rewards, terminals, info def visualize_eval( - agent, env, model=None, w=None, horizon=10, init_obs=None, compound=True, deterministic=False, show=False, filename=None + agent, + env, + model=None, + w=None, + horizon=10, + init_obs=None, + compound=True, + deterministic=False, + show=False, + filename=None, ): - """Generates a plot of the evolution of the state, reward and model 
predicitions ove time. + """Generates a plot of the evolution of the state, reward and model predictions over time. Args: agent: agent to be evaluated @@ -213,10 +227,16 @@ def visualize_eval( acts = F.one_hot(acts, num_classes=env.action_space.n).squeeze(1) for step in range(len(real_obs)): if compound or step == 0: - obs, r, done, info = model_env.step(th.tensor(obs).to(agent.device), acts[step], deterministic=deterministic) + obs, r, done, info = model_env.step( + th.tensor(obs).to(agent.device), + acts[step], + deterministic=deterministic, + ) else: obs, r, done, info = model_env.step( - th.tensor(real_obs[step - 1]).to(agent.device), acts[step], deterministic=deterministic + th.tensor(real_obs[step - 1]).to(agent.device), + acts[step], + deterministic=deterministic, ) model_obs.append(obs.copy()) model_obs_stds.append(np.sqrt(info["var_obs"].copy())) @@ -240,11 +260,26 @@ def visualize_eval( axs[i].set_ylabel(f"Reward {i - obs_dim}") axs[i].grid(alpha=0.25) if w is not None: - axs[i].plot(x, [real_vec_rewards[step][i - obs_dim] for step in x], label="Environment", color="black") + axs[i].plot( + x, + [real_vec_rewards[step][i - obs_dim] for step in x], + label="Environment", + color="black", + ) else: - axs[i].plot(x, [real_rewards[step] for step in x], label="Environment", color="black") + axs[i].plot( + x, + [real_rewards[step] for step in x], + label="Environment", + color="black", + ) if model is not None: - axs[i].plot(x, [model_rewards[step][i - obs_dim] for step in x], label="Model", color="blue") + axs[i].plot( + x, + [model_rewards[step][i - obs_dim] for step in x], + label="Model", + color="blue", + ) axs[i].fill_between( x, [model_rewards[step][i - obs_dim] + model_rewards_stds[step][i - obs_dim] for step in x], diff --git a/morl_baselines/common/morl_algorithm.py b/morl_baselines/common/morl_algorithm.py index b7af6cd5..920a5342 100644 --- a/morl_baselines/common/morl_algorithm.py +++ b/morl_baselines/common/morl_algorithm.py @@ -1,4 +1,5 @@ """MORL algorithm base classes.""" + import os import time from abc import ABC, abstractmethod @@ -11,7 +12,7 @@ import torch.nn import wandb from gymnasium import spaces -from mo_gymnasium.utils import MOSyncVectorEnv +from mo_gymnasium.wrappers.vector import MOSyncVectorEnv from morl_baselines.common.evaluation import ( eval_mo_reward_conditioned, diff --git a/morl_baselines/common/pareto.py b/morl_baselines/common/pareto.py index 76a64254..63828bd1 100644 --- a/morl_baselines/common/pareto.py +++ b/morl_baselines/common/pareto.py @@ -1,4 +1,5 @@ """Pareto utilities.""" + from copy import deepcopy from typing import List, Union diff --git a/morl_baselines/common/performance_indicators.py b/morl_baselines/common/performance_indicators.py index 3d957f1b..8462dbb3 100644 --- a/morl_baselines/common/performance_indicators.py +++ b/morl_baselines/common/performance_indicators.py @@ -2,6 +2,7 @@ We mostly rely on pymoo for the computation of axiomatic indicators (HV and IGD), but some are customly made. 
""" + from copy import deepcopy from typing import Callable, List diff --git a/morl_baselines/common/prioritized_buffer.py b/morl_baselines/common/prioritized_buffer.py index 99ba8b84..24662b76 100644 --- a/morl_baselines/common/prioritized_buffer.py +++ b/morl_baselines/common/prioritized_buffer.py @@ -2,6 +2,7 @@ Code adapted from https://github.com/sfujim/LAP-PAL """ + import numpy as np import torch as th diff --git a/morl_baselines/common/scalarization.py b/morl_baselines/common/scalarization.py index 3fd2ffc2..a8a7f7f5 100644 --- a/morl_baselines/common/scalarization.py +++ b/morl_baselines/common/scalarization.py @@ -1,4 +1,5 @@ """Scalarization functions relying on numpy.""" + import numpy as np from pymoo.decomposition.tchebicheff import Tchebicheff diff --git a/morl_baselines/common/utils.py b/morl_baselines/common/utils.py index 88157a2e..00c01d8e 100644 --- a/morl_baselines/common/utils.py +++ b/morl_baselines/common/utils.py @@ -1,4 +1,5 @@ """General utils for the MORL baselines.""" + import math import os from typing import Callable, List diff --git a/morl_baselines/multi_policy/capql/capql.py b/morl_baselines/multi_policy/capql/capql.py index 54ecd625..1ae46bdc 100644 --- a/morl_baselines/multi_policy/capql/capql.py +++ b/morl_baselines/multi_policy/capql/capql.py @@ -1,4 +1,5 @@ """CAPQL algorithm.""" + import os import random from itertools import chain diff --git a/morl_baselines/multi_policy/envelope/envelope.py b/morl_baselines/multi_policy/envelope/envelope.py index 6899c585..c4ad7903 100644 --- a/morl_baselines/multi_policy/envelope/envelope.py +++ b/morl_baselines/multi_policy/envelope/envelope.py @@ -1,4 +1,5 @@ """Envelope Q-Learning implementation.""" + import os from typing import List, Optional, Union from typing_extensions import override diff --git a/morl_baselines/multi_policy/gpi_pd/gpi_pd.py b/morl_baselines/multi_policy/gpi_pd/gpi_pd.py index 9d315a85..8903288b 100644 --- a/morl_baselines/multi_policy/gpi_pd/gpi_pd.py +++ b/morl_baselines/multi_policy/gpi_pd/gpi_pd.py @@ -1,4 +1,5 @@ """GPI-PD algorithm.""" + import os import random from itertools import chain diff --git a/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py b/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py index a9e31fbd..37e9b165 100644 --- a/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py +++ b/morl_baselines/multi_policy/gpi_pd/gpi_pd_continuous_action.py @@ -1,4 +1,5 @@ """GPI-PD algorithm with continuous actions.""" + import os import random from itertools import chain diff --git a/morl_baselines/multi_policy/linear_support/linear_support.py b/morl_baselines/multi_policy/linear_support/linear_support.py index a9d6dda1..12df3f92 100644 --- a/morl_baselines/multi_policy/linear_support/linear_support.py +++ b/morl_baselines/multi_policy/linear_support/linear_support.py @@ -1,4 +1,5 @@ """Linear Support implementation.""" + import random from copy import deepcopy from typing import List, Optional diff --git a/morl_baselines/multi_policy/morld/morld.py b/morl_baselines/multi_policy/morld/morld.py index c931efc7..69c0ee3c 100644 --- a/morl_baselines/multi_policy/morld/morld.py +++ b/morl_baselines/multi_policy/morld/morld.py @@ -2,6 +2,7 @@ See Felten, Talbi & Danoy (2024): https://arxiv.org/abs/2311.12495. 
""" + import math import time from typing import Callable, List, Optional, Tuple, Union @@ -10,7 +11,7 @@ import gymnasium as gym import numpy as np import torch as th -from mo_gymnasium import MONormalizeReward +from mo_gymnasium.wrappers import MONormalizeReward from torch import optim from morl_baselines.common.evaluation import log_all_multi_policy_metrics diff --git a/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py b/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py index 232e9b51..0341e08e 100644 --- a/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py +++ b/morl_baselines/multi_policy/multi_policy_moqlearning/mp_mo_q_learning.py @@ -1,4 +1,5 @@ """Outer-loop MOQ-learning algorithm (uses multiple weights).""" + import time from copy import deepcopy from typing import List, Optional diff --git a/morl_baselines/multi_policy/pareto_q_learning/pql.py b/morl_baselines/multi_policy/pareto_q_learning/pql.py index 45077aa5..2b315dd7 100644 --- a/morl_baselines/multi_policy/pareto_q_learning/pql.py +++ b/morl_baselines/multi_policy/pareto_q_learning/pql.py @@ -1,4 +1,5 @@ """Pareto Q-Learning.""" + import numbers from typing import Callable, List, Optional @@ -60,19 +61,19 @@ def __init__( # Algorithm setup self.ref_point = ref_point - if type(self.env.action_space) == gym.spaces.Discrete: + if isinstance(self.env.action_space, gym.spaces.Discrete): self.num_actions = self.env.action_space.n - elif type(self.env.action_space) == gym.spaces.MultiDiscrete: + elif isinstance(self.env.action_space, gym.spaces.MultiDiscrete): self.num_actions = np.prod(self.env.action_space.nvec) else: raise Exception("PQL only supports (multi)discrete action spaces.") - if type(self.env.observation_space) == gym.spaces.Discrete: + if isinstance(self.env.observation_space, gym.spaces.Discrete): self.env_shape = (self.env.observation_space.n,) - elif type(self.env.observation_space) == gym.spaces.MultiDiscrete: + elif isinstance(self.env.observation_space, gym.spaces.MultiDiscrete): self.env_shape = self.env.observation_space.nvec elif ( - type(self.env.observation_space) == gym.spaces.Box + isinstance(self.env.observation_space, gym.spaces.Box) and self.env.observation_space.is_bounded(manner="both") and issubclass(self.env.observation_space.dtype.type, numbers.Integral) ): @@ -83,7 +84,7 @@ def __init__( raise Exception("PQL only supports discretizable observation spaces.") self.num_states = np.prod(self.env_shape) - self.num_objectives = self.env.reward_space.shape[0] + self.num_objectives = self.env.unwrapped.reward_space.shape[0] self.counts = np.zeros((self.num_states, self.num_actions)) self.non_dominated = [ [{tuple(np.zeros(self.num_objectives))} for _ in range(self.num_actions)] for _ in range(self.num_states) @@ -96,7 +97,11 @@ def __init__( self.log = log if self.log: - self.setup_wandb(project_name=self.project_name, experiment_name=self.experiment_name, entity=wandb_entity) + self.setup_wandb( + project_name=self.project_name, + experiment_name=self.experiment_name, + entity=wandb_entity, + ) def get_config(self) -> dict: """Get the configuration dictionary. diff --git a/morl_baselines/multi_policy/pcn/pcn.py b/morl_baselines/multi_policy/pcn/pcn.py index 2e380024..48c162b9 100644 --- a/morl_baselines/multi_policy/pcn/pcn.py +++ b/morl_baselines/multi_policy/pcn/pcn.py @@ -1,4 +1,5 @@ """Pareto Conditioned Network. 
Code adapted from https://github.com/mathieu-reymond/pareto-conditioned-networks .""" + import heapq import os from abc import ABC diff --git a/morl_baselines/multi_policy/pgmorl/pgmorl.py b/morl_baselines/multi_policy/pgmorl/pgmorl.py index 02c77fcf..4ca3aef8 100644 --- a/morl_baselines/multi_policy/pgmorl/pgmorl.py +++ b/morl_baselines/multi_policy/pgmorl/pgmorl.py @@ -4,6 +4,7 @@ (!) Limited to 2 objectives for now. (!) The post-processing phase has not been implemented yet. """ + import time from copy import deepcopy from typing import List, Optional, Tuple, Union @@ -420,7 +421,7 @@ def __init__( envs = [make_env(env_id, self.seed + i, i, experiment_name, self.gamma) for i in range(self.num_envs)] else: envs = [make_env(env_id, i, i, experiment_name, self.gamma) for i in range(self.num_envs)] - self.env = mo_gym.MOSyncVectorEnv(envs) + self.env = mo_gym.wrappers.vector.MOSyncVectorEnv(envs) else: raise ValueError("Environments should be vectorized for PPO. You should provide an environment id instead.") @@ -506,7 +507,9 @@ def get_config(self) -> dict: def __train_all_agents(self, iteration: int, max_iterations: int): for i, agent in enumerate(self.agents): + agent.global_step = self.global_step agent.train(self.start_time, iteration, max_iterations) + self.global_step += self.steps_per_iteration * self.num_envs def __eval_all_agents( self, @@ -631,7 +634,9 @@ def train( } ) self.num_eval_weights_for_eval = num_eval_weights_for_eval - max_iterations = total_timesteps // self.steps_per_iteration // self.num_envs + # 1 iteration is a full batch for each agents + # -> (steps_per_iteration * num_envs * pop_size) timesteps per iteration + max_iterations = total_timesteps // self.steps_per_iteration // self.num_envs // self.pop_size iteration = 0 # Init current_evaluations = [np.zeros(self.reward_dim) for _ in range(len(self.agents))] @@ -646,7 +651,7 @@ def train( # Warmup for i in range(1, self.warmup_iterations + 1): - print(f"Warmup iteration #{iteration}") + print(f"Warmup iteration #{iteration}, global step: {self.global_step}") if self.log: wandb.log({"charts/warmup_iterations": i, "global_step": self.global_step}) self.__train_all_agents(iteration=iteration, max_iterations=max_iterations) diff --git a/morl_baselines/single_policy/esr/eupg.py b/morl_baselines/single_policy/esr/eupg.py index 9eaa9d89..efd417e1 100644 --- a/morl_baselines/single_policy/esr/eupg.py +++ b/morl_baselines/single_policy/esr/eupg.py @@ -1,4 +1,5 @@ """EUPG is an ESR algorithm based on Policy Gradient (REINFORCE like).""" + import time from copy import deepcopy from typing import Callable, List, Optional, Union diff --git a/morl_baselines/single_policy/ser/mo_ppo.py b/morl_baselines/single_policy/ser/mo_ppo.py index 09385534..6c0cf84a 100644 --- a/morl_baselines/single_policy/ser/mo_ppo.py +++ b/morl_baselines/single_policy/ser/mo_ppo.py @@ -1,4 +1,5 @@ """Multi-Objective PPO Algorithm.""" + import time from copy import deepcopy from typing import List, Optional, Union @@ -9,7 +10,7 @@ import numpy as np import torch as th import wandb -from mo_gymnasium import MORecordEpisodeStatistics +from mo_gymnasium.wrappers import MORecordEpisodeStatistics from torch import nn, optim from torch.distributions import Normal @@ -122,7 +123,7 @@ def thunk(): env = mo_gym.make(env_id, render_mode="rgb_array") else: env = mo_gym.make(env_id) - reward_dim = env.reward_space.shape[0] + reward_dim = env.unwrapped.reward_space.shape[0] """ if idx == 0: env = gym.wrappers.RecordVideo( env, @@ -131,10 +132,10 @@ def thunk(): ) """ 
env = gym.wrappers.ClipAction(env) env = gym.wrappers.NormalizeObservation(env) - env = gym.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -10, 10)) + env = gym.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -10, 10), env.observation_space) for o in range(reward_dim): - env = mo_gym.utils.MONormalizeReward(env, idx=o, gamma=gamma) - env = mo_gym.utils.MOClipReward(env, idx=o, min_r=-10, max_r=10) + env = mo_gym.wrappers.MONormalizeReward(env, idx=o, gamma=gamma) + env = mo_gym.wrappers.MOClipReward(env, idx=o, min_r=-10, max_r=10) env = MORecordEpisodeStatistics(env, gamma=gamma) env.reset(seed=seed) env.action_space.seed(seed) @@ -404,7 +405,7 @@ def __collect_samples(self, obs: th.Tensor, done: th.Tensor): value = value.view(self.num_envs, self.networks.reward_dim) # Perform action on the environment - next_obs, reward, next_terminated, _, info = self.envs.step(action.cpu().numpy()) + next_obs, reward, next_terminated, next_truncated, info = self.envs.step(action.cpu().numpy()) reward = th.tensor(reward).to(self.device).view(self.num_envs, self.networks.reward_dim) # storing to batch self.batch.add(obs, action, logprob, reward, done, value) @@ -413,16 +414,19 @@ def __collect_samples(self, obs: th.Tensor, done: th.Tensor): obs, done = th.Tensor(next_obs).to(self.device), th.Tensor(next_terminated).to(self.device) # Episode info logging - if "episode" in info.keys(): - for item in info["episode"]: + if self.log and "episode" in info.keys(): + indices = np.where(next_terminated | next_truncated)[0] + for idx in indices: + # Reconstructs the dict by extracting the relevant information for each vectorized env + info_log = {k: v[idx] for k, v in info["episode"].items()} + log_episode_info( - item, + info_log, scalarization=np.dot, weights=self.weights, global_timestep=self.global_step, id=self.id, ) - break return obs, done @@ -603,6 +607,7 @@ def train(self, start_time, current_iteration: int, max_iterations: int): # Logging print("SPS:", int(self.global_step / (time.time() - start_time))) if self.log: + print(f"Worker {self.id} - Global step: {self.global_step}") wandb.log( {"charts/SPS": int(self.global_step / (time.time() - start_time)), "global_step": self.global_step}, ) diff --git a/morl_baselines/single_policy/ser/mo_q_learning.py b/morl_baselines/single_policy/ser/mo_q_learning.py index 5abe72d3..1061fcc2 100644 --- a/morl_baselines/single_policy/ser/mo_q_learning.py +++ b/morl_baselines/single_policy/ser/mo_q_learning.py @@ -1,4 +1,5 @@ """Scalarized Q-learning for single policy multi-objective reinforcement learning.""" + import time from typing import Optional from typing_extensions import override diff --git a/pyproject.toml b/pyproject.toml index 362d1037..09a544d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,8 +21,8 @@ classifiers = [ 'Topic :: Scientific/Engineering :: Artificial Intelligence', ] dependencies = [ - "mo-gymnasium >=1.0.1", - "gymnasium>=0.28.1,<0.30", + "mo-gymnasium >=1.1.0", + "gymnasium>=1.0.0", "numpy >=1.21.0,<2.0.0", "torch >=1.12.0", "pygame >=2.1.0", @@ -40,12 +40,12 @@ dynamic = ["version"] [project.optional-dependencies] # Update dependencies in `all` if any are added or removed # OLS requires pycddlib and libgmp to be installed, which does not work on MacOS for now. 
-ols = ["pycddlib"] -gpi = ["pycddlib"] +ols = ["pycddlib==2.1.6"] +gpi = ["pycddlib==2.1.6"] all = [ # OLS & GPI - "pycddlib", + "pycddlib==2.1.6", ] testing = ["pytest ==7.1.3"] @@ -66,7 +66,6 @@ morl_baselines = ["*.json", "assets/*"] # Linting, testing, ... ######################################################## [tool.black] -safe = true line-length = 127 target-version = ['py38', 'py39', 'py310'] include = '\.pyi?$' diff --git a/tests/test_algos.py b/tests/test_algos.py index d64174e7..584b704c 100644 --- a/tests/test_algos.py +++ b/tests/test_algos.py @@ -1,4 +1,5 @@ """Mostly tests to make sure the algorithms are able to run.""" + import time import mo_gymnasium as mo_gym