#!/usr/bin/env python3

import math

- from typing import Any, Callable, cast, Tuple, Union
+ from typing import Any, Callable, cast, List, Tuple, Union

import torch

from captum._utils.common import (
from captum.attr._utils.common import _format_input_baseline
from captum.log import log_usage
from torch import dtype, Tensor
+ from torch.futures import Future


class FeatureAblation(PerturbationAttribution):
@@ -62,6 +63,7 @@ def __init__(self, forward_func: Callable) -> None:
        # input grow as expected. Once it turns to True, we will assume the model's
        # behavior stays consistent and no longer check again
        self._is_output_shape_valid = False
+         self.use_futures = False

    @log_usage()
    def attribute(
@@ -286,9 +288,19 @@ def attribute(

        # Computes initial evaluation with all features, which is compared
        # to each ablated result.
-         initial_eval = self._strict_run_forward(
+         initial_eval: Union[Tensor, Future[Tensor]] = _run_forward(
            self.forward_func, inputs, target, additional_forward_args
        )
+         if self.use_futures:
+             assert isinstance(initial_eval, torch.Future), (
+                 "when use_futures is True, initial_eval should have "
+                 f"Future type rather than {type(initial_eval)}"
+             )
+
+             initial_eval.wait()
+             initial_eval = initial_eval.value()
+
+         initial_eval = self._parse_forward_out(initial_eval)

        if show_progress:
            attr_progress.update()
@@ -301,7 +313,7 @@ def attribute(
        flattened_initial_eval = initial_eval.reshape(1, -1)

        # Initialize attribution totals and counts
-         attrib_type = cast(dtype, flattened_initial_eval.dtype)
+         attrib_type = flattened_initial_eval.dtype

        total_attrib = [
            # attribute w.r.t each output element
@@ -313,6 +325,7 @@ def attribute(
            for input in inputs
        ]

+         weights: List[Tensor] = []
        # Weights are used in cases where ablations may be overlapping.
        if self.use_weights:
            weights = [
@@ -321,6 +334,7 @@ def attribute(
                ).float()
                for input in inputs
            ]
+         all_futures = []

        # Iterate through each feature tensor for ablation
        for i in range(len(inputs)):
@@ -348,7 +362,7 @@ def attribute(
                # agg mode: (*initial_eval.shape)
                # non-agg mode:
                # (feature_perturbed * batch_size, *initial_eval.shape[1:])
-                 modified_eval = self._strict_run_forward(
+                 modified_eval = _run_forward(
                    self.forward_func,
                    current_inputs,
                    current_target,
@@ -358,61 +372,62 @@ def attribute(
                if show_progress:
                    attr_progress.update()

-                 # if perturbations_per_eval > 1, the output shape must grow with
-                 # input and not be aggregated
-                 if perturbations_per_eval > 1 and not self._is_output_shape_valid:
-                     current_batch_size = current_inputs[0].shape[0]
-
-                     # number of perturbation, which is not the same as
-                     # perturbations_per_eval when not enough features to perturb
-                     n_perturb = current_batch_size / num_examples
-
-                     current_output_shape = modified_eval.shape
-
-                     # use initial_eval as the forward of perturbations_per_eval = 1
-                     initial_output_shape = initial_eval.shape
-
-                     assert (
-                         # check if the output is not a scalar
-                         current_output_shape
-                         and initial_output_shape
-                         # check if the output grow in same ratio, i.e., not agg
-                         and current_output_shape[0]
-                         == n_perturb * initial_output_shape[0]
-                     ), (
-                         "When perturbations_per_eval > 1, forward_func's output "
-                         "should be a tensor whose 1st dim grow with the input "
-                         f"batch size: when input batch size is {num_examples}, "
-                         f"the output shape is {initial_output_shape}; "
-                         f"when input batch size is {current_batch_size}, "
-                         f"the output shape is {current_output_shape}"
+                 if self.use_futures:
+                     assert isinstance(modified_eval, torch.Future), (
+                         "when use_futures is True, modified_eval should have "
+                         f"Future type rather than {type(modified_eval)}"
+                     )
+                     parsed_out_future = modified_eval.then(
+                         lambda x: self._parse_forward_out(x.value())
                    )

-                     self._is_output_shape_valid = True
-
-                 # reshape the leading dim for n_feature_perturbed
-                 # flatten each feature's eval outputs into 1D of (n_outputs)
-                 modified_eval = modified_eval.reshape(-1, n_outputs)
-                 # eval_diff in shape (n_feature_perturbed, n_outputs)
-                 eval_diff = flattened_initial_eval - modified_eval
-
-                 # append the shape of one input example
-                 # to make it broadcastable to mask
-                 eval_diff = eval_diff.reshape(
-                     eval_diff.shape + (inputs[i].dim() - 1) * (1,)
-                 )
-                 eval_diff = eval_diff.to(total_attrib[i].device)
+                     all_futures.append(
+                         parsed_out_future.then(
+                             lambda modified_eval_future, current_inputs=current_inputs, current_mask=current_mask, i=i: self.process_ablated_out(  # type: ignore # noqa: E501 line too long
+                                 modified_eval_future.value(),
+                                 current_inputs,
+                                 current_mask,
+                                 perturbations_per_eval,
+                                 num_examples,
+                                 initial_eval,
+                                 flattened_initial_eval,
+                                 inputs,
+                                 n_outputs,
+                                 total_attrib,
+                                 weights,
+                                 i,
+                                 attrib_type,
+                             )
+                         )
+                     )
+                     continue

-                 if self.use_weights:
-                     weights[i] += current_mask.float().sum(dim=0)
+                 modified_eval = self._parse_forward_out(modified_eval)

-                 total_attrib[i] += (eval_diff * current_mask.to(attrib_type)).sum(
-                     dim=0
+                 self.process_ablated_out(
+                     modified_eval,
+                     current_inputs,
+                     current_mask,
+                     perturbations_per_eval,
+                     num_examples,
+                     initial_eval,
+                     flattened_initial_eval,
+                     inputs,
+                     n_outputs,
+                     total_attrib,
+                     weights,
+                     i,
+                     attrib_type,
                )

        if show_progress:
            attr_progress.close()

+         if len(all_futures) > 0:
+             # torch.futures.Future.wait_all takes list of torch.futures.Future
+             # but will cast it to torch._C.Future internally.
+             torch.futures.wait_all(cast(List[Future], all_futures))
+
        # Divide total attributions by counts and return formatted attributions
        if self.use_weights:
            attrib = tuple(
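For readers unfamiliar with `torch.futures`, the pattern the new code relies on — chaining a callback onto each forward call with `Future.then` and blocking on all of them with `torch.futures.wait_all` — can be sketched in isolation. This is an illustrative sketch only, not code from this PR; `async_forward` and `parse_out` are made-up stand-ins for the model forward and `_parse_forward_out`.

```python
import torch
from torch.futures import Future


def parse_out(x: torch.Tensor) -> torch.Tensor:
    # stand-in for _parse_forward_out: pass the tensor through unchanged
    return x


def async_forward() -> Future:
    # pretend the forward pass completes asynchronously by returning an
    # already-completed Future (a real caller might return one from RPC)
    fut: Future = torch.futures.Future()
    fut.set_result(torch.ones(4))
    return fut


all_futures = []
for i in range(3):
    modified_eval = async_forward()
    # .then() hands the *completed* future to the callback; .value() unwraps it
    parsed = modified_eval.then(lambda fut: parse_out(fut.value()))
    # bind the loop variable as a default argument so each callback keeps its
    # own i, mirroring the "lambda ..., i=i:" trick in the diff above
    all_futures.append(
        parsed.then(lambda fut, i=i: print(f"perturbation {i}: sum={fut.value().sum().item()}"))
    )

# block until every chained callback has run
torch.futures.wait_all(all_futures)
```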
@@ -593,13 +608,12 @@ def _get_feature_counts(self, inputs, feature_mask, **kwargs):
            for inp, mask in zip(inputs, feature_mask)
        )

-     def _strict_run_forward(self, *args, **kwargs) -> Tensor:
+     def _parse_forward_out(self, forward_output) -> Tensor:
        """
        A temp wrapper for global _run_forward util to force forward output
        type assertion & conversion.
        Remove after the strict logic is supported by all attr classes
        """
-         forward_output = _run_forward(*args, **kwargs)
        if isinstance(forward_output, Tensor):
            return forward_output
@@ -612,4 +626,67 @@ def _strict_run_forward(self, *args, **kwargs) -> Tensor:
        # using python built-in type as torch dtype
        # int -> torch.int64, float -> torch.float64
        # ref: https://github.com/pytorch/pytorch/pull/21215
-         return torch.tensor(forward_output, dtype=output_type)
+         return torch.tensor(forward_output, dtype=cast(dtype, output_type))
+
+     def process_ablated_out(
+         self,
+         modified_eval,
+         current_inputs,
+         current_mask,
+         perturbations_per_eval,
+         num_examples,
+         initial_eval,
+         flattened_initial_eval,
+         inputs,
+         n_outputs,
+         total_attrib,
+         weights,
+         i,
+         attrib_type,
+     ):
+         # if perturbations_per_eval > 1, the output shape must grow with
+         # input and not be aggregated
+         if perturbations_per_eval > 1 and not self._is_output_shape_valid:
+             current_batch_size = current_inputs[0].shape[0]
+
+             # number of perturbation, which is not the same as
+             # perturbations_per_eval when not enough features to perturb
+             n_perturb = current_batch_size / num_examples
+
+             current_output_shape = modified_eval.shape
+
+             # use initial_eval as the forward of perturbations_per_eval = 1
+             initial_output_shape = initial_eval.shape
+
+             assert (
+                 # check if the output is not a scalar
+                 current_output_shape
+                 and initial_output_shape
+                 # check if the output grow in same ratio, i.e., not agg
+                 and current_output_shape[0] == n_perturb * initial_output_shape[0]
+             ), (
+                 "When perturbations_per_eval > 1, forward_func's output "
+                 "should be a tensor whose 1st dim grow with the input "
+                 f"batch size: when input batch size is {num_examples}, "
+                 f"the output shape is {initial_output_shape}; "
+                 f"when input batch size is {current_batch_size}, "
+                 f"the output shape is {current_output_shape}"
+             )
+
+             self._is_output_shape_valid = True
+
+         # reshape the leading dim for n_feature_perturbed
+         # flatten each feature's eval outputs into 1D of (n_outputs)
+         modified_eval = modified_eval.reshape(-1, n_outputs)
+         # eval_diff in shape (n_feature_perturbed, n_outputs)
+         eval_diff = flattened_initial_eval - modified_eval
+
+         # append the shape of one input example
+         # to make it broadcastable to mask
+         eval_diff = eval_diff.reshape(eval_diff.shape + (inputs[i].dim() - 1) * (1,))
+         eval_diff = eval_diff.to(total_attrib[i].device)
+
+         if self.use_weights:
+             weights[i] += current_mask.float().sum(dim=0)
+
+         total_attrib[i] += (eval_diff * current_mask.to(attrib_type)).sum(dim=0)
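Taken together, a caller opts into the asynchronous path by flipping the new `use_futures` flag and supplying a `forward_func` that returns a `torch.futures.Future`. The snippet below is a hypothetical usage sketch under those assumptions; the toy model and the way the Future is produced are not part of this diff.

```python
import torch
from captum.attr import FeatureAblation

model = torch.nn.Linear(3, 1)


def async_forward(inp: torch.Tensor) -> torch.futures.Future:
    # wrap an ordinary forward pass in an already-completed Future;
    # a real deployment might instead return a Future from an RPC call
    fut: torch.futures.Future = torch.futures.Future()
    fut.set_result(model(inp).squeeze(-1))  # one scalar output per example
    return fut


fa = FeatureAblation(async_forward)
fa.use_futures = True  # new flag added by this diff; defaults to False

inp = torch.rand(2, 3)
# each perturbed forward now returns a Future; attribute() chains
# process_ablated_out onto it and calls wait_all before aggregating
attr = fa.attribute(inp, perturbations_per_eval=1)
print(attr.shape)  # torch.Size([2, 3]): one attribution per input feature
```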