Clean fluid APIs in distributed and fleet files (#48851)
* Fix a bug in the reduce_sum op: when input.numel() > INT32_MAX, its result
is wrong.
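
A minimal repro sketch of this class of overflow; the exact shape is an
assumption (any tensor with numel() above INT32_MAX qualifies), and at int64
it needs roughly 17 GB of memory:

    import paddle

    # numel() = 2**31 + 8 > INT32_MAX; before the fix, 32-bit indexing in
    # the reduction kernel overflowed and produced a wrong sum.
    x = paddle.ones([2**31 + 8], dtype='int64')
    print(paddle.sum(x))  # expected value: 2147483656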

* Remove climits.

* Clean fluid APIs in the paddle/distributed and paddle/fleet folders,
covering the following files:
python/paddle/distributed/__init__.py
python/paddle/distributed/collective.py
python/paddle/distributed/fleet/utils/fs.py
python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
python/paddle/distributed/fleet/utils/internal_storage.py
python/paddle/distributed/launch/context/device.py
python/paddle/distributed/parallel.py
python/paddle/distributed/parallel_with_gloo.py
python/paddle/distributed/spawn.py
python/paddle/framework/__init__.py
Note that 'paddle.fluid.dygraph.parallel.ParallelEnv' and
'fluid.framework.core' are kept unchanged in those files. ParallelEnv is
used by paddle.fluid.dygraph.parallel.DataParallel, but the APIs in
paddle.fluid.dygraph.parallel cannot be migrated to paddle.distributed,
because there are cyclic import dependencies through modules like
paddle.static and paddle.tensor (a minimal illustration follows at the end
of this message). And 'fluid.framework.core' will be switched to import
framework.core once fluid.core itself has been migrated.

* Change TODO authors.
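
As promised in the third item above, a minimal sketch of the cyclic-import
failure mode, using two hypothetical modules rather than Paddle's actual
import graph; running `python -c "import a"` fails because b sees a
partially initialized a:

    # a.py
    import b  # b starts executing before helper() below is defined

    def helper():
        return "ok"

    # b.py
    import a

    print(a.helper())  # AttributeError: partially initialized module 'a'

Breaking such a cycle means deferring or re-homing imports, which is why
ParallelEnv stays in fluid for now.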
GhostScreaming authored Dec 8, 2022
1 parent ea9e408 commit 911d6bb
Showing 11 changed files with 61 additions and 28 deletions.
3 changes: 3 additions & 0 deletions python/paddle/distributed/__init__.py
@@ -64,6 +64,9 @@
 from .entry_attr import CountFilterEntry  # noqa: F401
 from .entry_attr import ShowClickEntry  # noqa: F401

+# (TODO: GhostScreaming) It needs migration of ParallelEnv. However,
+# it's hard to migrate APIs in paddle.fluid.dygraph.parallel completely.
+# It will be replaced later.
 from paddle.fluid.dygraph.parallel import ParallelEnv  # noqa: F401

 from . import cloud_utils  # noqa: F401
4 changes: 3 additions & 1 deletion python/paddle/distributed/collective.py
@@ -15,9 +15,11 @@
 import datetime

 import paddle
+
+# (TODO: GhostScreaming) It will be removed later.
 import paddle.fluid.core as core
+from paddle.framework import _non_static_mode, in_dygraph_mode

-from ..fluid.framework import _non_static_mode, in_dygraph_mode
 from .communication.group import Group, _add_new_group, is_initialized
 from .fleet.layers.mpu.mp_ops import _c_concat  # noqa: F401
 from .fleet.layers.mpu.mp_ops import _c_identity  # noqa: F401
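
A quick smoke test of the import path this diff switches to; it assumes a
Paddle build that already contains this commit:

    from paddle.framework import _non_static_mode, in_dygraph_mode

    print(in_dygraph_mode())  # True under Paddle's default eager mode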
1 change: 1 addition & 0 deletions python/paddle/distributed/fleet/utils/fs.py
@@ -20,6 +20,7 @@
 import shutil
 import time

+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core

 from .log_util import logger
python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -17,8 +17,10 @@
 import numpy as np

 import paddle.distributed.fleet as fleet
+
+# (TODO: GhostScreaming) It will be removed later.
 import paddle.fluid.core as core
-from paddle.fluid.framework import Block, Program, _non_static_mode
+from paddle.framework import Block, Program, _non_static_mode


 class HybridParallelInferenceHelper:
7 changes: 5 additions & 2 deletions python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -14,13 +14,16 @@

 import paddle
 from paddle import framework
+
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
-from paddle.fluid.dygraph.parallel import (
+from paddle.framework import (
+    _in_legacy_dygraph,
     _split_tensors,
     build_groups,
+    in_dygraph_mode,
     sync_params_buffers,
 )
-from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode

 from .log_util import logger
22 changes: 12 additions & 10 deletions python/paddle/distributed/fleet/utils/internal_storage.py
@@ -25,7 +25,9 @@
 import numpy as np

 import paddle
-import paddle.fluid as fluid
+from paddle import framework
+
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core

 from ..meta_parallel.sharding.sharding_utils import Type, device_guard
@@ -111,7 +113,7 @@ def to(self, device, dtype=None, keep_alignment=True):
         if keep_alignment:
             self._array_params()

-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def add_rank_params(self, trainable_params, param2align, convert_gpu=True):
         """
         Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer.
Expand Down Expand Up @@ -145,7 +147,7 @@ def add_rank_params(self, trainable_params, param2align, convert_gpu=True):
self._params.append(param)
self._param_ids.append(id(param))

@fluid.dygraph.no_grad
@framework.no_grad()
def _add_param_as_view(self, param, align, convert_gpu=True):

assert (
Expand Down Expand Up @@ -185,7 +187,7 @@ def _add_param_as_view(self, param, align, convert_gpu=True):
self._fill = offset
return p_shape

@fluid.dygraph.no_grad
@framework.no_grad()
def _convert_buffer(self, param, p_shape, align):

var_end = self._fill + np.prod(p_shape)
@@ -199,7 +201,7 @@ def _convert_buffer(self, param, p_shape, align):

         self._fill = offset

-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _array_params(self):
         """
         Given the parameters which have been registered previously, rebuild the whole InternalStorage.
Expand Down Expand Up @@ -261,7 +263,7 @@ def to(self, device, dtype=None, keep_alignment=True):
if keep_alignment:
self._array_grads()

@fluid.dygraph.no_grad
@framework.no_grad()
def add_grad(self, param, align):
"""
Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer.
@@ -275,7 +277,7 @@ def add_grad(self, param, align):
             self._params.append(param)
             self._param_ids.append(id(param))

-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def manumal_relase(self):
         """
         Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use.
@@ -291,7 +293,7 @@ def manumal_relase(self):
         self.params_checked_in = 0
         self._release = True

-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def rebuild(self):
         """
         Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
@@ -305,7 +307,7 @@ def rebuild(self):

         self._release = False

-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _array_grads(self):
         """
         Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage.
@@ -315,7 +317,7 @@ def _array_grads(self):
         for p in self._params:
             self._add_grad_as_view(p, self._parm2align[p.name])

-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _add_grad_as_view(self, param, align):
         assert (
             np.prod(self.buffer.shape) > 0
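
One note on the decorator swap repeated through this file: the old
fluid.dygraph.no_grad was applied bare, while framework.no_grad() is called
to produce the decorator. A small usage sketch under that reading; the scale
function is hypothetical, and the final True assumes no_grad marks outputs
as stop_gradient:

    import paddle
    from paddle import framework

    @framework.no_grad()  # note the call: no_grad() returns the decorator
    def scale(x):
        return x * 2

    x = paddle.ones([2])
    x.stop_gradient = False  # request gradients on the input
    y = scale(x)
    print(y.stop_gradient)  # True: the op ran without gradient tracking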
24 changes: 13 additions & 11 deletions python/paddle/distributed/launch/context/device.py
@@ -14,9 +14,11 @@

 import os

-import paddle.fluid as fluid
 from paddle.device import get_available_custom_device
+
+# (TODO: GhostScreaming) It will be removed later.
+from paddle.fluid import core


 class DeviceType:
     CPU = 'cpu'
@@ -148,25 +150,25 @@ def get_custom_devices_count(device_type):
             )
             if visible_devices_str in os.environ:
                 visible_devices = os.getenv(visible_devices_str)
-        elif fluid.core.is_compiled_with_cuda():
+        elif core.is_compiled_with_cuda():
             dev._dtype = DeviceType.GPU
-            num = fluid.core.get_cuda_device_count()
+            num = core.get_cuda_device_count()
             visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_xpu():
+        elif core.is_compiled_with_xpu():
             dev._dtype = DeviceType.XPU
-            num = fluid.core.get_xpu_device_count()
+            num = core.get_xpu_device_count()
             visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_npu():
+        elif core.is_compiled_with_npu():
             dev._dtype = DeviceType.NPU
-            num = fluid.core.get_npu_device_count()
+            num = core.get_npu_device_count()
             visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_mlu():
+        elif core.is_compiled_with_mlu():
             dev._dtype = DeviceType.MLU
-            num = fluid.core.get_mlu_device_count()
+            num = core.get_mlu_device_count()
             visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_ipu():
+        elif core.is_compiled_with_ipu():
             dev._dtype = DeviceType.IPU
-            num = fluid.core.get_ipu_device_count()
+            num = core.get_ipu_device_count()
             # For IPUs, 'labels' is a list which contains the available numbers of IPU devices.
             dev._labels = [str(x) for x in range(0, num + 1)]
             return dev
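
A short sketch of how the rewritten cascade resolves devices on a CUDA
build; the two-GPU output is an assumption:

    # core still comes from paddle.fluid here, per the TODO above.
    from paddle.fluid import core

    if core.is_compiled_with_cuda():
        num = core.get_cuda_device_count()
        print([str(i) for i in range(num)])  # e.g. ['0', '1'] on a 2-GPU box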
12 changes: 10 additions & 2 deletions python/paddle/distributed/parallel.py
@@ -38,10 +38,18 @@
 from paddle.distributed.fleet.launch_utils import check_backend

 # deprecated module import
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
-from paddle.fluid.dygraph import parallel_helper
+
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.framework import _set_expected_place, in_dygraph_mode
+
+# (TODO: GhostScreaming) It will be removed later.
+from paddle.framework import (
+    _set_expected_place,
+    in_dygraph_mode,
+    parallel_helper,
+)

 __all__ = []
1 change: 1 addition & 0 deletions python/paddle/distributed/parallel_with_gloo.py
@@ -20,6 +20,7 @@
 )

 # deprecated module import
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core

 __all__ = []
3 changes: 2 additions & 1 deletion python/paddle/distributed/spawn.py
@@ -37,8 +37,9 @@
 )

 # deprecated module import
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
-from paddle.fluid.framework import set_flags
+from paddle.framework import set_flags

 __all__ = []
8 changes: 8 additions & 0 deletions python/paddle/framework/__init__.py
@@ -68,5 +68,13 @@
 from ..fluid.framework import _global_flags  # noqa: F401
 from ..fluid.framework import _apply_pass  # noqa: F401
 from ..fluid.framework import switch_main_program
+from ..fluid.framework import _set_expected_place  # noqa: F401
+from ..fluid.framework import Block, Program  # noqa: F401
+from ..fluid.dygraph import parallel_helper  # noqa: F401
+from ..fluid.dygraph.parallel import (
+    _split_tensors,
+    build_groups,
+    sync_params_buffers,
+)

 __all__ = []
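
With these re-exports in place, downstream code can pull the symbols from
paddle.framework instead of reaching into fluid directly; a minimal sketch:

    from paddle.framework import Block, Program

    prog = Program()  # static-graph program container
    print(isinstance(prog.global_block(), Block))  # True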
