support multi-gpu prediction (#350)

HydrogenSulfate · web-flow · commit 0e0089bf8c06 · 2023-05-29T14:59:28.000+08:00
diff --git a/ppsci/solver/solver.py b/ppsci/solver/solver.py
@@ -20,6 +20,7 @@
 import os
 import sys
 from typing import Any
+from typing import Callable
 from typing import Dict
 from typing import Optional
 from typing import Union
@@ -261,7 +262,7 @@ def __init__(
             logger.warning(
                 f"Detected world_size({self.world_size}) > 1, it is recommended to "
                 "scale up the learning rate and reduce the epochs or "
-                "iters_per_epoch according to the world_size number both linearly."
+                "iters_per_epoch according to the world_size both linearly."
             )
 
         self.global_step = 0
@@ -468,55 +469,100 @@ def visualize(self, epoch_id: int = 0):
         self.visu_func(self, epoch_id)
         logger.info(f"[Visualize][Epoch {epoch_id}] Finished visualization")
 
-    @paddle.no_grad()
     @misc.run_on_eval_mode
     def predict(
         self,
         input_dict: Dict[str, Union[np.ndarray, paddle.Tensor]],
+        expr_dict: Optional[Dict[str, Callable]] = None,
         batch_size: int = 64,
+        no_grad: bool = True,
     ) -> Dict[str, paddle.Tensor]:
-        """Pure prediction using model.forward(...), support single device prediction yet.
+        """Pure prediction using model.forward(...) and expressions (optional, if given).
 
         Args:
             input_dict (Dict[str, Union[np.ndarray, paddle.Tensor]]): Input data in dict.
+            expr_dict (Optional[Dict[str, Callable]]): Expression dict, which guides
+                computing equation variables with callable functions. Defaults to None.
             batch_size (int, optional): Predicting by batch size. Defaults to 64.
-
+            no_grad (bool): Whether to set stop_gradient=True for the entire prediction,
+                mainly for memory efficiency. Defaults to True.
         Returns:
             Dict[str, paddle.Tensor]: Prediction in dict.
         """
-        if self.world_size > 1:
-            raise NotImplementedError(
-                "Solver.predict only support single device yet, "
-                f"but got {self.world_size} devices."
-            )
-
         num_samples = len(next(iter(input_dict.values())))
-        batch_num = (num_samples + (batch_size - 1)) // batch_size
+        num_pad = (self.world_size - num_samples % self.world_size) % self.world_size
+        # pad with the last element if `num_samples` is not divisible by `world_size`,
+        # ensuring every device gets the same number of samples.
+        if num_pad > 0:
+            for k, v in input_dict.items():
+                repeat_times = (num_pad, *(1 for _ in range(v.ndim - 1)))
+                input_dict[k] = paddle.concat(
+                    (
+                        v,
+                        paddle.tile(v[num_samples - 1 : num_samples], repeat_times),
+                    ),
+                )
+
+        num_samples_pad = num_samples + num_pad
+        local_num_samples_pad = num_samples_pad // self.world_size
+        local_input_dict = (
+            {k: v[self.rank :: self.world_size] for k, v in input_dict.items()}
+            if self.world_size > 1
+            else input_dict
+        )
+        local_batch_num = (local_num_samples_pad + (batch_size - 1)) // batch_size
         pred_dict = misc.Prettydefaultdict(list)
-        for batch_id in range(batch_num):
-            batch_input_dict = {}
-            st = batch_id * batch_size
-            ed = min(num_samples, (batch_id + 1) * batch_size)
-
-            # prepare batch input dict
-            for key in input_dict:
-                if not paddle.is_tensor(input_dict[key]):
-                    batch_input_dict[key] = paddle.to_tensor(
-                        input_dict[key][st:ed], paddle.get_default_dtype()
+        with self.no_grad_context_manager(no_grad), self.no_sync_context_manager(
+            self.world_size > 1, self.model
+        ):
+            for batch_id in range(local_batch_num):
+                batch_input_dict = {}
+                st = batch_id * batch_size
+                ed = min(local_num_samples_pad, (batch_id + 1) * batch_size)
+
+                # prepare batch input dict
+                for key in local_input_dict:
+                    if not paddle.is_tensor(local_input_dict[key]):
+                        batch_input_dict[key] = paddle.to_tensor(
+                            local_input_dict[key][st:ed], paddle.get_default_dtype()
+                        )
+                    else:
+                        batch_input_dict[key] = local_input_dict[key][st:ed]
+                    batch_input_dict[key].stop_gradient = no_grad
+
+                # forward
+                with self.autocast_context_manager(self.use_amp, self.amp_level):
+                    batch_output_dict = self.forward_helper.visu_forward(
+                        expr_dict, batch_input_dict, self.model
                     )
-                else:
-                    batch_input_dict[key] = input_dict[key][st:ed]
-                batch_input_dict[key].stop_gradient = False
-
-            # forward
-            with self.autocast_context_manager(self.use_amp, self.amp_level):
-                batch_output_dict = self.model(batch_input_dict)
 
-            # collect batch data
-            for key, batch_output in batch_output_dict.items():
-                pred_dict[key].append(batch_output)
-
-        pred_dict = {key: paddle.concat(value) for key, value in pred_dict.items()}
+                # collect batch data
+                for key, batch_output in batch_output_dict.items():
+                    pred_dict[key].append(batch_output.detach())
+
+            # concatenate local predictions
+            pred_dict = {key: paddle.concat(value) for key, value in pred_dict.items()}
+
+            if self.world_size > 1:
+                # gather global predictions from all devices if world_size > 1
+                pred_dict = {
+                    key: misc.all_gather(value) for key, value in pred_dict.items()
+                }
+
+                # rearrange predictions into the same order as input_dict using the inverse
+                # permutation, then discard predictions of padding data at the end
+                perm = np.arange(num_samples_pad, dtype="int64")
+                perm = np.concatenate(
+                    [perm[rank :: self.world_size] for rank in range(self.world_size)],
+                    axis=0,
+                )
+                perm_inv = np.empty_like(perm)
+                perm_inv[perm] = np.arange(num_samples_pad, dtype="int64")
+                perm_inv = paddle.to_tensor(perm_inv)
+                pred_dict = {
+                    key: value[perm_inv][:num_samples]
+                    for key, value in pred_dict.items()
+                }
 
         return pred_dict
 
@@ -599,7 +645,7 @@ def no_sync_context_manager(
             if not isinstance(ddp_model, paddle.DataParallel):
                 raise TypeError(
                     "no_sync interface is only for model with type paddle.DataParallel, "
-                    f"but got type {type(ddp_model)}"
+                    f"but got type {misc.typename(ddp_model)}"
                 )
             ctx_manager = ddp_model.no_sync()
         else:
diff --git a/ppsci/solver/visu.py b/ppsci/solver/visu.py
@@ -14,13 +14,17 @@
 
 import os
 import os.path as osp
+from typing import TYPE_CHECKING
 
 import paddle
 
+if TYPE_CHECKING:
+    from ppsci import solver
+
 from ppsci.utils import misc
 
 
-def visualize_func(solver, epoch_id: int):
+def visualize_func(solver: "solver.Solver", epoch_id: int):
     """Visualization program
 
     Args:
diff --git a/ppsci/utils/expression.py b/ppsci/utils/expression.py
@@ -15,6 +15,7 @@
 from typing import TYPE_CHECKING
 from typing import Callable
 from typing import Dict
+from typing import Optional
 from typing import Tuple
 
 import paddle
@@ -121,23 +122,23 @@ def eval_forward(
 
     def visu_forward(
         self,
-        expr_dict: Dict[str, Callable],
+        expr_dict: Optional[Dict[str, Callable]],
         input_dict: Dict[str, paddle.Tensor],
         model: nn.Layer,
     ):
         # model forward
-        if callable(next(iter(expr_dict.values()))):
-            output_dict = model(input_dict)
+        output_dict = model(input_dict)
 
-        # equation forward
-        for name, expr in expr_dict.items():
-            if callable(expr):
-                output_dict[name] = expr({**output_dict, **input_dict})
-            else:
-                raise TypeError(f"expr type({type(expr)}) is invalid")
+        if isinstance(expr_dict, dict):
+            # equation forward
+            for name, expr in expr_dict.items():
+                if callable(expr):
+                    output_dict[name] = expr({**output_dict, **input_dict})
+                else:
+                    raise TypeError(f"expr type({type(expr)}) is invalid")
 
-        # clear differentiation cache
-        clear()
+            # clear differentiation cache
+            clear()
 
         # compute loss for each validator according to its' own output, label and weight
         return output_dict
diff --git a/ppsci/utils/misc.py b/ppsci/utils/misc.py
@@ -19,6 +19,7 @@
 from typing import Dict
 from typing import List
 from typing import Tuple
+from typing import Union
 
 import numpy as np
 import paddle
@@ -121,7 +122,7 @@ def convert_to_dict(array: np.ndarray, keys: Tuple[str, ...]) -> Dict[str, np.nd
 
 def all_gather(
     tensor: paddle.Tensor, concat: bool = True, axis: int = 0
-) -> List[paddle.Tensor]:
+) -> Union[paddle.Tensor, List[paddle.Tensor]]:
     """Gather tensor from all devices, concatenate them along given axis if specified.
 
     Args: