
Commit b8e4186

vivekmig authored and facebook-github-bot committed
DataParallel DeepLift Fixes (#335)
Summary: This PR adds only the bug fixes identified from refactoring and adding dynamic tests.

1. Additional forward arguments were not working appropriately with DeepLift on a DataParallel model, because the device split of the expanded additional forward args didn't necessarily match that of the inputs. The behavior has been changed to expand the additional args in the hook function (after the device split), which ensures the additional args and inputs remain matched.
2. Different targets per example were not working appropriately with DeepLift on a DataParallel model, because the model output concatenated the outputs of the devices in DataParallel, which mixed input / baseline outputs and inhibited appropriate matching between input example and target. Additional forward hooks have been added to return the output with all inputs followed by all baselines.
3. GradCAM is primarily intended for layers with >= 3 dimensions, since it computes the average gradient for each example / channel. For layers with 2 dimensions, the mean gradient over all dimensions was being taken. This has been updated to use the layer gradients directly in this case, which better aligns with the behavior for >= 3 dimensions.
4. DeepLiftShap (and its Neuron / Layer variants) was incorrectly repeating additional forward args; this has been fixed to use repeat_interleave instead.

Pull Request resolved: #335
Reviewed By: edward-io
Differential Revision: D20844511
Pulled By: vivekmig
fbshipit-source-id: c895b348c3d5c56355c39d429947f2f36dda37a7
1 parent 72294ff commit b8e4186
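For context on fix 4 above: DeepLiftShap tiles each input example across every baseline, so additional forward arguments have to be repeated per example rather than tiled as a whole batch. A minimal sketch of the difference between the two expansion modes using plain PyTorch ops (the tensor `extra_arg` and the factor of 2 are made up for illustration; Captum's `_expand_additional_forward_args` helper operates on tuples of arguments rather than a single tensor):

import torch

# Hypothetical additional forward arg for a batch of 2 examples.
extra_arg = torch.tensor([10, 20])

# ExpansionTypes.repeat tiles the whole batch, like Tensor.repeat:
repeated = extra_arg.repeat(2)
print(repeated.tolist())     # [10, 20, 10, 20]

# ExpansionTypes.repeat_interleave repeats each example in place, which is
# what matches inputs that DeepLiftShap expands per baseline:
interleaved = torch.repeat_interleave(extra_arg, 2, dim=0)
print(interleaved.tolist())  # [10, 10, 20, 20]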

File tree

4 files changed: +44 −20 lines changed


captum/attr/_core/deep_lift.py

+37 −12

@@ -20,6 +20,7 @@
     _format_tensor_into_tuples,
     _is_tuple,
     _run_forward,
+    _select_targets,
 )
 from ..._utils.typing import (
     BaselineType,
@@ -304,22 +305,20 @@ def attribute(  # type: ignore
         )

         baselines = _tensorize_baseline(inputs, baselines)
-        main_model_pre_hook = self._pre_hook_main_model()
+        main_model_pre_hooks = self._hook_main_model()

         self.model.apply(self._register_hooks)

         additional_forward_args = _format_additional_forward_args(
             additional_forward_args
         )
-        input_base_additional_args = _expand_additional_forward_args(
-            additional_forward_args, 2, ExpansionTypes.repeat
-        )
+
         expanded_target = _expand_target(
             target, 2, expansion_type=ExpansionTypes.repeat
         )

         wrapped_forward_func = self._construct_forward_func(
-            self.model, (inputs, baselines), expanded_target, input_base_additional_args
+            self.model, (inputs, baselines), expanded_target, additional_forward_args
         )
         gradients = self.gradient_func(wrapped_forward_func, inputs)
         if custom_attribution_func is None:
@@ -332,7 +331,9 @@ def attribute(  # type: ignore
                 custom_attribution_func, gradients, inputs, baselines
             )
         # remove hooks from all activations
-        main_model_pre_hook.remove()
+        for hook in main_model_pre_hooks:
+            hook.remove()
+
         self._remove_hooks()

         undo_gradient_requirements(inputs, gradient_mask)
@@ -355,7 +356,12 @@ def _construct_forward_func(
         additional_forward_args: Any = None,
     ) -> Callable:
         def forward_fn():
-            return _run_forward(forward_func, inputs, target, additional_forward_args)
+            model_out = _run_forward(
+                forward_func, inputs, None, additional_forward_args
+            )
+            return _select_targets(
+                torch.cat((model_out[:, 0], model_out[:, 1])), target
+            )

         if hasattr(forward_func, "device_ids"):
             forward_fn.device_ids = forward_func.device_ids  # type: ignore
@@ -501,7 +507,7 @@ def _remove_hooks(self) -> None:
         for backward_handle in self.backward_handles:
             backward_handle.remove()

-    def _pre_hook_main_model(self) -> RemovableHandle:
+    def _hook_main_model(self) -> List[RemovableHandle]:
         def pre_hook(module: Module, baseline_inputs_add_args: Tuple) -> Tuple:
             inputs = baseline_inputs_add_args[0]
             baselines = baseline_inputs_add_args[1]
@@ -514,13 +520,28 @@ def pre_hook(module: Module, baseline_inputs_add_args: Tuple) -> Tuple:
                 for input, baseline in zip(inputs, baselines)
             )
             if additional_args is not None:
-                return (*baseline_input_tsr, *additional_args)
+                expanded_additional_args = cast(
+                    Tuple,
+                    _expand_additional_forward_args(
+                        additional_args, 2, ExpansionTypes.repeat
+                    ),
+                )
+                return (*baseline_input_tsr, *expanded_additional_args)
             return baseline_input_tsr

+        def forward_hook(module: Module, inputs: Tuple, outputs: Tensor):
+            return torch.stack(torch.chunk(outputs, 2), dim=1)
+
         if isinstance(self.model, nn.DataParallel):
-            return self.model.module.register_forward_pre_hook(pre_hook)  # type: ignore
+            return [
+                self.model.module.register_forward_pre_hook(pre_hook),  # type: ignore
+                self.model.module.register_forward_hook(forward_hook),
+            ]  # type: ignore
         else:
-            return self.model.register_forward_pre_hook(pre_hook)  # type: ignore
+            return [
+                self.model.register_forward_pre_hook(pre_hook),  # type: ignore
+                self.model.register_forward_hook(forward_hook),
+            ]  # type: ignore

     def has_convergence_delta(self) -> bool:
         return True
@@ -810,7 +831,11 @@ def _expand_inputs_baselines_targets(
             target, base_bsz, expansion_type=ExpansionTypes.repeat_interleave
         )
         input_additional_args = (
-            _expand_additional_forward_args(additional_forward_args, base_bsz)
+            _expand_additional_forward_args(
+                additional_forward_args,
+                base_bsz,
+                expansion_type=ExpansionTypes.repeat_interleave,
+            )
             if additional_forward_args is not None
             else None
         )
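To see why the new forward_hook and the torch.cat((model_out[:, 0], model_out[:, 1])) in forward_fn are needed: the pre-hook on each DataParallel replica concatenates that device's input chunk with its baseline chunk, so each replica's raw output is "its inputs then its baselines", and a plain gather across devices mixes the groups. A self-contained sketch with dummy scalar outputs and two simulated device chunks (values and variable names are illustrative only, not Captum internals):

import torch

# A batch of 4 examples split across 2 devices (2 per device). Each replica
# outputs its 2 input results followed by its 2 baseline results.
dev0_out = torch.tensor([1.0, 2.0, -1.0, -2.0])
dev1_out = torch.tensor([3.0, 4.0, -3.0, -4.0])

# Without the forward hook, the gather concatenates per-device outputs and
# mixes input / baseline groups:
naive = torch.cat([dev0_out, dev1_out])
# tensor([ 1.,  2., -1., -2.,  3.,  4., -3., -4.])

# The new forward hook regroups each device's output into (input, baseline)
# pairs before the gather:
dev0_paired = torch.stack(torch.chunk(dev0_out, 2), dim=1)  # shape (2, 2)
dev1_paired = torch.stack(torch.chunk(dev1_out, 2), dim=1)
gathered = torch.cat([dev0_paired, dev1_paired])            # what forward_fn sees

# forward_fn then rebuilds "all inputs followed by all baselines", so targets
# can be selected per example:
ordered = torch.cat((gathered[:, 0], gathered[:, 1]))
# tensor([ 1.,  2.,  3.,  4., -1., -2., -3., -4.])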

captum/attr/_core/feature_ablation.py

+1 −1

@@ -243,7 +243,7 @@ def attribute(
         feature_mask = _format_input(feature_mask) if feature_mask is not None else None
         assert (
             isinstance(perturbations_per_eval, int) and perturbations_per_eval >= 1
-        ), "Ablations per evaluation must be at least 1."
+        ), "Perturbations per evaluation must be an integer and at least 1."
         with torch.no_grad():
             # Computes initial evaluation with all features, which is compared
             # to each ablated result.

captum/attr/_core/layer/grad_cam.py

+2 −0

@@ -207,6 +207,8 @@ def attribute(
                 dim=tuple(x for x in range(2, len(layer_grad.shape))),
                 keepdim=True,
             )
+            if len(layer_grad.shape) > 2
+            else layer_grad
             for layer_grad in layer_gradients
         )
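A small illustration of this GradCAM change (plain PyTorch; the shapes and names are chosen for illustration, not taken from Captum): layer gradients with 3 or more dimensions are still averaged over the trailing spatial dimensions per example and channel, while 2-dimensional layer gradients are now used directly instead of being collapsed to a single mean.

import torch

# Gradients w.r.t. a conv layer output: (batch, channels, H, W).
conv_grad = torch.randn(8, 16, 7, 7)
conv_weights = (
    # Average over the spatial dims per example / channel, keepdim for broadcasting.
    torch.mean(conv_grad, dim=tuple(range(2, conv_grad.dim())), keepdim=True)
    if conv_grad.dim() > 2
    else conv_grad
)  # shape (8, 16, 1, 1)

# Gradients w.r.t. a 2-D (e.g. fully connected) layer output: (batch, features).
fc_grad = torch.randn(8, 16)
fc_weights = (
    torch.mean(fc_grad, dim=tuple(range(2, fc_grad.dim())), keepdim=True)
    if fc_grad.dim() > 2
    else fc_grad  # no spatial dims to average over, use the gradients as-is
)  # shape (8, 16)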

captum/attr/_core/layer/layer_deep_lift.py

+4 −7

@@ -8,7 +8,6 @@

 from ...._utils.common import (
     ExpansionTypes,
-    _expand_additional_forward_args,
     _expand_target,
     _format_additional_forward_args,
     _format_input,
@@ -277,21 +276,18 @@ def attribute(

         baselines = _tensorize_baseline(inputs, baselines)

-        main_model_pre_hook = self._pre_hook_main_model()
+        main_model_hooks = self._hook_main_model()

         self.model.apply(self._register_hooks)

         additional_forward_args = _format_additional_forward_args(
             additional_forward_args
         )
-        input_base_additional_args = _expand_additional_forward_args(
-            additional_forward_args, 2, ExpansionTypes.repeat
-        )
         expanded_target = _expand_target(
             target, 2, expansion_type=ExpansionTypes.repeat
         )
         wrapped_forward_func = self._construct_forward_func(
-            self.model, (inputs, baselines), expanded_target, input_base_additional_args
+            self.model, (inputs, baselines), expanded_target, additional_forward_args,
         )

         def chunk_output_fn(out: TensorOrTupleOfTensorsGeneric,) -> Sequence:
@@ -323,8 +319,9 @@ def chunk_output_fn(out: TensorOrTupleOfTensorsGeneric,) -> Sequence:
                 custom_attribution_func, gradients, attr_inputs, attr_baselines
             )
         # remove hooks from all activations
-        main_model_pre_hook.remove()
         self._remove_hooks()
+        for hook in main_model_hooks:
+            hook.remove()

         undo_gradient_requirements(inputs, gradient_mask)
         return _compute_conv_delta_and_format_attrs(
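Both attribute() methods above now track a list of hook handles instead of a single pre-hook handle. A toy sketch of that registration-and-cleanup pattern on an arbitrary module (the module and hook bodies below are placeholders, not Captum code):

import torch
import torch.nn as nn

# Stand-in for self.model; purely for illustration.
model = nn.Linear(4, 2)

def pre_hook(module, args):
    # Runs before forward; a real hook could repack the positional args here.
    return args

def forward_hook(module, inputs, output):
    # Runs after forward; a real hook could reshape the output here.
    return output

# The hooking helper now returns both handles so the caller can clean up both.
handles = [
    model.register_forward_pre_hook(pre_hook),
    model.register_forward_hook(forward_hook),
]

_ = model(torch.randn(3, 4))

# Cleanup mirrors the loop added in both attribute() methods.
for handle in handles:
    handle.remove()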
