Internal Batching Improvements #333

Status: Closed (12 commits)
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -43,7 +43,7 @@ commands:
steps:
- run:
name: "Check import order with isort"
command: isort --check-only
command: isort --check-only -v
Contributor Author

Switching this to verbose to get more detail on incorrect import orderings in CircleCI.


mypy_check:
description: "Static type checking with mypy"
2 changes: 1 addition & 1 deletion .isort.cfg
@@ -4,4 +4,4 @@ include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
known_third_party=pytext,torchvision,bs4
known_third_party=pytext,torchvision,bs4,torch
86 changes: 62 additions & 24 deletions captum/attr/_core/integrated_gradients.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
import typing
from typing import Any, Callable, Tuple, Union
from typing import Any, Callable, List, Tuple, Union

import torch
from torch import Tensor
@@ -19,7 +19,7 @@
)
from .._utils.approximation_methods import approximation_parameters
from .._utils.attribution import GradientAttribution
from .._utils.batching import _batched_operator
from .._utils.batching import _batch_attribution
from .._utils.common import (
_format_attributions,
_format_input_baseline,
@@ -196,9 +196,10 @@ def attribute( # type: ignore
`riemann_trapezoid` or `gausslegendre`.
Default: `gausslegendre` if no method is provided.
internal_batch_size (int, optional): Divides total #steps * #examples
data points into chunks of size internal_batch_size,
data points into chunks of size at most internal_batch_size,
which are computed (forward / backward passes)
sequentially.
sequentially. internal_batch_size must be at least equal to
#examples.
For DataParallel models, each batch is split among the
available devices, so evaluations on each available
device contain internal_batch_size / num_devices examples.
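
For context, here is a minimal usage sketch of the batched path (the toy model, shapes, and values below are illustrative, not from this PR): with internal_batch_size set, the #steps * #examples scaled evaluations are processed in chunks rather than in a single forward/backward pass.

```python
# Illustrative only: a toy model; 4 examples * 50 steps = 200 scaled points
# are evaluated in chunks of at most 64 instead of all at once.
import torch
from captum.attr import IntegratedGradients

model = torch.nn.Sequential(torch.nn.Linear(10, 2))
ig = IntegratedGradients(model)

inputs = torch.randn(4, 10, requires_grad=True)
attributions, delta = ig.attribute(
    inputs,
    baselines=torch.zeros_like(inputs),
    target=0,
    n_steps=50,
    internal_batch_size=64,  # must be >= #examples (here, 4)
    return_convergence_delta=True,
)
```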
@@ -248,9 +249,59 @@ def attribute( # type: ignore

_validate_input(inputs, baselines, n_steps, method)

# retrieve step size and scaling factor for specified approximation method
step_sizes_func, alphas_func = approximation_parameters(method)
step_sizes, alphas = step_sizes_func(n_steps), alphas_func(n_steps)
if internal_batch_size is not None:
num_examples = inputs[0].shape[0]
attributions = _batch_attribution(
self,
num_examples,
internal_batch_size,
n_steps,
inputs=inputs,
baselines=baselines,
target=target,
additional_forward_args=additional_forward_args,
method=method,
)
else:
attributions = self._attribute(
inputs=inputs,
baselines=baselines,
target=target,
additional_forward_args=additional_forward_args,
n_steps=n_steps,
method=method,
)

if return_convergence_delta:
start_point, end_point = baselines, inputs
# computes approximation error based on the completeness axiom
delta = self.compute_convergence_delta(
attributions,
start_point,
end_point,
additional_forward_args=additional_forward_args,
target=target,
)
return _format_attributions(is_inputs_tuple, attributions), delta
return _format_attributions(is_inputs_tuple, attributions)

def _attribute(
self,
inputs: Tuple[Tensor, ...],
baselines: Tuple[Union[Tensor, int, float], ...],
target: TargetType = None,
additional_forward_args: Any = None,
n_steps: int = 50,
method: str = "gausslegendre",
step_sizes_and_alphas: Union[None, Tuple[List[float], List[float]]] = None,
) -> Tuple[Tensor, ...]:
if step_sizes_and_alphas is None:
# retrieve step size and scaling factor for specified
# approximation method
step_sizes_func, alphas_func = approximation_parameters(method)
step_sizes, alphas = step_sizes_func(n_steps), alphas_func(n_steps)
else:
step_sizes, alphas = step_sizes_and_alphas

# scale features and compute gradients. (batch size is abbreviated as bsz)
# scaled_features' dim -> (bsz * #steps x inputs[0].shape[1:], ...)
@@ -277,13 +328,11 @@ def attribute( # type: ignore
expanded_target = _expand_target(target, n_steps)

# grads: dim -> (bsz * #steps x inputs[0].shape[1:], ...)
grads = _batched_operator(
self.gradient_func,
scaled_features_tpl,
input_additional_args,
internal_batch_size=internal_batch_size,
grads = self.gradient_func(
forward_fn=self.forward_func,
inputs=scaled_features_tpl,
target_ind=expanded_target,
additional_forward_args=input_additional_args,
)

# flattening grads so that we can multiply it with step-size
@@ -309,18 +358,7 @@ def attribute( # type: ignore
total_grad * (input - baseline)
for total_grad, input, baseline in zip(total_grads, inputs, baselines)
)
if return_convergence_delta:
start_point, end_point = baselines, inputs
# computes approximation error based on the completeness axiom
delta = self.compute_convergence_delta(
attributions,
start_point,
end_point,
additional_forward_args=additional_forward_args,
target=target,
)
return _format_attributions(is_inputs_tuple, attributions), delta
return _format_attributions(is_inputs_tuple, attributions)
return attributions

def has_convergence_delta(self) -> bool:
return True
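
The new `_batch_attribution` helper lives in `captum/attr/_utils/batching.py` and is not part of this file's diff. As a rough, hypothetical sketch of the idea (names and details below are mine, not the actual helper): it splits the integration steps into chunks whose size times #examples stays within internal_batch_size, and sums the partial results returned by `_attribute`.

```python
# Hypothetical sketch of the chunking idea, not the real _batch_attribution.
# It omits the include_endpoint handling that LayerConductance relies on.
from captum.attr._utils.approximation_methods import approximation_parameters


def batch_attribution_sketch(attr_method, num_examples, internal_batch_size,
                             n_steps, method="gausslegendre", **kwargs):
    # At least one step per example must fit in a chunk, hence the documented
    # requirement internal_batch_size >= #examples.
    steps_per_chunk = max(internal_batch_size // num_examples, 1)

    # Step sizes / alphas are computed once for the full path and sliced per
    # chunk via the new step_sizes_and_alphas argument of _attribute.
    step_sizes_func, alphas_func = approximation_parameters(method)
    full_step_sizes, full_alphas = step_sizes_func(n_steps), alphas_func(n_steps)

    total = None
    for start in range(0, n_steps, steps_per_chunk):
        end = min(start + steps_per_chunk, n_steps)
        partial = attr_method._attribute(
            n_steps=end - start,
            method=method,
            step_sizes_and_alphas=(full_step_sizes[start:end],
                                   full_alphas[start:end]),
            **kwargs,
        )
        # Partial attributions from each chunk add up to the full integral.
        total = partial if total is None else tuple(
            t + p for t, p in zip(total, partial)
        )
    return total
```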
61 changes: 50 additions & 11 deletions captum/attr/_core/layer/internal_influence.py
@@ -13,7 +13,7 @@
from ...._utils.typing import BaselineType, TargetType
from ..._utils.approximation_methods import approximation_parameters
from ..._utils.attribution import GradientAttribution, LayerAttribution
from ..._utils.batching import _batched_operator
from ..._utils.batching import _batch_attribution
from ..._utils.common import (
_format_attributions,
_format_input_baseline,
@@ -163,9 +163,10 @@ def attribute(
`riemann_trapezoid` or `gausslegendre`.
Default: `gausslegendre` if no method is provided.
internal_batch_size (int, optional): Divides total #steps * #examples
data points into chunks of size internal_batch_size,
data points into chunks of size at most internal_batch_size,
which are computed (forward / backward passes)
sequentially.
sequentially. internal_batch_size must be at least equal to
#examples.
For DataParallel models, each batch is split among the
available devices, so evaluations on each available
device contain internal_batch_size / num_devices examples.
@@ -213,10 +214,50 @@ def attribute(
"""
inputs, baselines = _format_input_baseline(inputs, baselines)
_validate_input(inputs, baselines, n_steps, method)
if internal_batch_size is not None:
num_examples = inputs[0].shape[0]
attrs = _batch_attribution(
self,
num_examples,
internal_batch_size,
n_steps,
inputs=inputs,
baselines=baselines,
target=target,
additional_forward_args=additional_forward_args,
method=method,
attribute_to_layer_input=attribute_to_layer_input,
)
else:
attrs = self._attribute(
inputs=inputs,
baselines=baselines,
target=target,
additional_forward_args=additional_forward_args,
n_steps=n_steps,
method=method,
attribute_to_layer_input=attribute_to_layer_input,
)

return attrs

# Retrieve step size and scaling factor for specified approximation method
step_sizes_func, alphas_func = approximation_parameters(method)
step_sizes, alphas = step_sizes_func(n_steps), alphas_func(n_steps)
def _attribute(
self,
inputs: Tuple[Tensor, ...],
baselines: Tuple[Union[Tensor, int, float], ...],
target: TargetType = None,
additional_forward_args: Any = None,
n_steps: int = 50,
method: str = "gausslegendre",
attribute_to_layer_input: bool = False,
step_sizes_and_alphas: Union[None, Tuple[List[float], List[float]]] = None,
) -> Union[Tensor, Tuple[Tensor, ...]]:
if step_sizes_and_alphas is None:
# retrieve step size and scaling factor for specified approximation method
step_sizes_func, alphas_func = approximation_parameters(method)
step_sizes, alphas = step_sizes_func(n_steps), alphas_func(n_steps)
else:
step_sizes, alphas = step_sizes_and_alphas

# Compute scaled inputs from baseline to final input.
scaled_features_tpl = tuple(
@@ -242,14 +283,12 @@ def attribute(
expanded_target = _expand_target(target, n_steps)

# Returns gradient of output with respect to hidden layer.
layer_gradients, _, is_layer_tuple = _batched_operator(
compute_layer_gradients_and_eval,
scaled_features_tpl,
input_additional_args,
internal_batch_size=internal_batch_size,
layer_gradients, _, is_layer_tuple = compute_layer_gradients_and_eval(
forward_fn=self.forward_func,
layer=self.layer,
inputs=scaled_features_tpl,
target_ind=expanded_target,
additional_forward_args=input_additional_args,
device_ids=self.device_ids,
attribute_to_layer_input=attribute_to_layer_input,
)
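
A minimal usage sketch for the layer case (the toy model and layer choice are illustrative): the same internal_batch_size argument now routes through `_batch_attribution` here as well.

```python
# Illustrative toy model; attributions are taken w.r.t. the first linear layer.
import torch
import torch.nn as nn
from captum.attr import InternalInfluence

model = nn.Sequential(nn.Linear(10, 5), nn.ReLU(), nn.Linear(5, 2))
influence = InternalInfluence(model, model[0])

layer_attrs = influence.attribute(
    torch.randn(4, 10),
    target=1,
    n_steps=50,
    internal_batch_size=32,  # >= #examples (here, 4)
)
```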
91 changes: 69 additions & 22 deletions captum/attr/_core/layer/layer_conductance.py
@@ -14,7 +14,7 @@
from ...._utils.typing import BaselineType, Literal, TargetType
from ..._utils.approximation_methods import approximation_parameters
from ..._utils.attribution import GradientAttribution, LayerAttribution
from ..._utils.batching import _batched_operator
from ..._utils.batching import _batch_attribution
from ..._utils.common import (
_format_attributions,
_format_input_baseline,
@@ -204,9 +204,10 @@ def attribute(
`riemann_trapezoid` or `gausslegendre`.
Default: `gausslegendre` if no method is provided.
internal_batch_size (int, optional): Divides total #steps * #examples
data points into chunks of size internal_batch_size,
data points into chunks of size at most internal_batch_size,
which are computed (forward / backward passes)
sequentially.
sequentially. internal_batch_size must be at least equal to
2 * #examples.
For DataParallel models, each batch is split among the
available devices, so evaluations on each available
device contain internal_batch_size / num_devices examples.
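
A note on the stricter 2 * #examples minimum here (my reading of the change, not stated explicitly in the diff): conductance differences layer activations between consecutive interpolation points, which is why the batched path below is invoked with n_steps + 1 and include_endpoint=True; every chunk therefore has to hold at least two adjacent points per example.

```python
# Illustrative arithmetic only: with 8 examples and internal_batch_size=16,
# a chunk holds at most 16 // 8 = 2 interpolation points per example, which is
# exactly the minimum needed to form one activation difference per example.
num_examples, internal_batch_size = 8, 16
steps_per_chunk = internal_batch_size // num_examples
assert steps_per_chunk >= 2  # anything smaller cannot produce a difference
```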
@@ -271,11 +272,66 @@ def attribute(
_validate_input(inputs, baselines, n_steps, method)

num_examples = inputs[0].shape[0]
if internal_batch_size is not None:
num_examples = inputs[0].shape[0]
attrs = _batch_attribution(
self,
num_examples,
internal_batch_size,
n_steps + 1,
include_endpoint=True,
inputs=inputs,
baselines=baselines,
target=target,
additional_forward_args=additional_forward_args,
method=method,
attribute_to_layer_input=attribute_to_layer_input,
)

else:
attrs = self._attribute(
inputs=inputs,
baselines=baselines,
target=target,
additional_forward_args=additional_forward_args,
n_steps=n_steps,
method=method,
attribute_to_layer_input=attribute_to_layer_input,
)

# Retrieve scaling factors for specified approximation method
step_sizes_func, alphas_func = approximation_parameters(method)
alphas = alphas_func(n_steps + 1)
is_layer_tuple = isinstance(attrs, tuple)
attributions = attrs if is_layer_tuple else (attrs,)

if return_convergence_delta:
start_point, end_point = baselines, inputs
delta = self.compute_convergence_delta(
attributions,
start_point,
end_point,
target=target,
additional_forward_args=additional_forward_args,
)
return _format_attributions(is_layer_tuple, attributions), delta
return _format_attributions(is_layer_tuple, attributions)

def _attribute(
self,
inputs: Tuple[Tensor, ...],
baselines: Tuple[Union[Tensor, int, float], ...],
target: TargetType = None,
additional_forward_args: Any = None,
n_steps: int = 50,
method: str = "gausslegendre",
attribute_to_layer_input: bool = False,
step_sizes_and_alphas: Union[None, Tuple[List[float], List[float]]] = None,
) -> Union[Tensor, Tuple[Tensor, ...]]:
num_examples = inputs[0].shape[0]
if step_sizes_and_alphas is None:
# Retrieve scaling factors for specified approximation method
step_sizes_func, alphas_func = approximation_parameters(method)
alphas = alphas_func(n_steps + 1)
else:
_, alphas = step_sizes_and_alphas
# Compute scaled inputs from baseline to final input.
scaled_features_tpl = tuple(
torch.cat(
@@ -301,13 +357,15 @@ def attribute(

# Conductance Gradients - Returns gradient of output with respect to
# hidden layer and hidden layer evaluated at each input.
layer_gradients, layer_evals, is_layer_tuple = _batched_operator(
compute_layer_gradients_and_eval,
scaled_features_tpl,
input_additional_args,
internal_batch_size=internal_batch_size,
(
layer_gradients,
layer_evals,
is_layer_tuple,
) = compute_layer_gradients_and_eval(
forward_fn=self.forward_func,
layer=self.layer,
inputs=scaled_features_tpl,
additional_forward_args=input_additional_args,
target_ind=expanded_target,
device_ids=self.device_ids,
attribute_to_layer_input=attribute_to_layer_input,
@@ -335,15 +393,4 @@ def attribute(
layer_gradients, layer_evals, grad_diffs
)
)

if return_convergence_delta:
start_point, end_point = baselines, inputs
delta = self.compute_convergence_delta(
attributions,
start_point,
end_point,
target=target,
additional_forward_args=additional_forward_args,
)
return _format_attributions(is_layer_tuple, attributions), delta
return _format_attributions(is_layer_tuple, attributions)
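
For completeness, a minimal usage sketch of the conductance path (toy model and values are illustrative): internal_batch_size is chosen to be at least 2 * #examples, per the updated docstring.

```python
import torch
import torch.nn as nn
from captum.attr import LayerConductance

model = nn.Sequential(nn.Linear(10, 5), nn.ReLU(), nn.Linear(5, 2))
lc = LayerConductance(model, model[0])

attrs, delta = lc.attribute(
    torch.randn(4, 10),
    target=0,
    n_steps=50,
    internal_batch_size=16,  # >= 2 * #examples (here, 2 * 4 = 8)
    return_convergence_delta=True,
)
```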
5 changes: 3 additions & 2 deletions captum/attr/_core/layer/layer_integrated_gradients.py
@@ -211,9 +211,10 @@ def attribute(
`riemann_trapezoid` or `gausslegendre`.
Default: `gausslegendre` if no method is provided.
internal_batch_size (int, optional): Divides total #steps * #examples
data points into chunks of size internal_batch_size,
data points into chunks of size at most internal_batch_size,
which are computed (forward / backward passes)
sequentially.
sequentially. internal_batch_size must be at least equal to
#examples.
For DataParallel models, each batch is split among the
available devices, so evaluations on each available
device contain internal_batch_size / num_devices examples.
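
Since the docstring calls out DataParallel splitting, here is a hypothetical multi-GPU sketch (the device handling and values are assumptions, not from this PR): choosing internal_batch_size as a multiple of the device count keeps per-device batches evenly sized.

```python
import torch
import torch.nn as nn
from captum.attr import LayerIntegratedGradients

model = nn.Sequential(nn.Linear(10, 5), nn.ReLU(), nn.Linear(5, 2))
# Wrap in DataParallel only when GPUs are available; falls back to CPU model.
wrapped = nn.DataParallel(model.cuda()) if torch.cuda.is_available() else model
num_devices = max(torch.cuda.device_count(), 1)

lig = LayerIntegratedGradients(wrapped, model[0])
inputs = torch.randn(4, 10).to(next(wrapped.parameters()).device)

attrs = lig.attribute(
    inputs,
    target=0,
    n_steps=50,
    internal_batch_size=32 * num_devices,  # ~32 scaled examples per device
)
```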