Skip to content

Commit 62cdfcb

Browse files
authored
Merge pull request #186 from Visual-Behavior/dev
add cuda shared memory for recurrent engines
2 parents fee0628 + bc6380f commit 62cdfcb

File tree

2 files changed

+109
-15
lines changed

2 files changed

+109
-15
lines changed

alonet/torch2trt/TRTExecutor.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def __init__(
5959
sync_mode: bool = False,
6060
verbose_logger: bool = False,
6161
profiling: bool = False,
62+
shared_mem: dict = {},
6263
):
6364
"""
6465
Parameters
@@ -69,11 +70,20 @@ def __init__(
6970
sync_mode: bool, default = False.
7071
True/False enable the synchronized/asynchronous execution of TensorRT engine
7172
logger: tensorrt.ILogger, logger to print info in terminal
73+
shared_mem: dict, input and output that share the same memory. Default {}.
74+
Output is redirected to the input after each execution.
75+
Example: shared_mem = {0: 1} makes input of index 0 share memory with output of index 1.
7276
"""
77+
assert isinstance(shared_mem, dict), f"shared_mem argument should be of type dict but got {shared_mem.__class__.__name__} instead"
78+
if shared_mem != {}:
79+
print("[WARNING] outputs with shared memory are static, please set outputs_to_cpu=True when executing if you want to retrieve them.")
80+
for inp, out in shared_mem.items():
81+
print(f"[INFO] input of index {inp} has shared memory with output of index {out}.")
7382
if prod_package_error is not None:
7483
raise prod_package_error
7584
self.sync_mode = sync_mode
7685
self.stream = stream
86+
self.shared_mem = shared_mem
7787
if verbose_logger:
7888
self.logger = trt.Logger(trt.Logger.VERBOSE)
7989
else:
@@ -91,7 +101,7 @@ def __init__(
91101
self.context.profiler = CustomProfiler()
92102
# Allocate_buffer take into account if engine has dynamic axes
93103
self.inputs, self.outputs, self.stream, self.has_dynamic_axes = allocate_buffers(
94-
self.context, self.stream, self.sync_mode
104+
self.context, self.stream, self.sync_mode, self.shared_mem,
95105
)
96106
self.dict_inputs = {mem_obj.name: mem_obj for mem_obj in self.inputs}
97107
self.dict_outputs = {mem_obj.name: mem_obj for mem_obj in self.outputs}
@@ -111,17 +121,61 @@ def print_bindings_info(self):
111121
shape: {self.engine.get_binding_shape(i)}, dtype: {self.engine.get_binding_dtype(i)}"
112122
)
113123

114-
def execute(self):
124+
def execute(self, inputs_from_cpu=False, outputs_to_cpu=False):
125+
"""Executes engine
126+
127+
Parameters
128+
----------
129+
inputs_from_cpu: bool, reload inputs from CPU again.
130+
outputs_to_cpu: bool, transfer all outputs back from GPU to CPU.
131+
132+
Examples
133+
--------
134+
~ Example of engine with 2 inputs and 2 outputs ~
135+
>>> # Normal use.
136+
>>> engine = TRTExecutor(**kwargs)
137+
>>> engine.inputs[0].host, engine.inputs[1].host = np.ones(1), np.ones(1)
138+
>>> outputs = engine.execute()
139+
>>>
140+
>>> # Engine with shared memory: redirecting output of index 1 to input of index 0.
141+
>>> engine = TRTExecutor(shared_mem={0: 1}, **kwargs)
142+
>>> for i, inp in enumerate(inputs):
143+
>>> engine.inputs[1].host = inp
144+
>>> ekwargs = {}
145+
>>> if i == 0:
146+
>>> ## First time only
147+
>>> engine.inputs[0].host = np.zeros(0)
148+
>>> ekwargs["inputs_from_cpu"] = True
149+
>>> engine.execute(**ekwargs)
150+
>>> # Retrieve last output 1 from gpu
151+
>>> engine.execute(outputs_to_cpu=True)
152+
>>> print(engine.outputs[1].host)
153+
"""
115154
if self.has_dynamic_axes:
116155
# Set input shape in context to update output shapes
117156
allocate_dynamic_mem(self.context, self.dict_inputs, self.dict_outputs)
118157

119158
if self.sync_mode:
120-
execute_sync(self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs)
159+
execute_sync(
160+
self.context,
161+
bindings=self.bindings,
162+
inputs=self.inputs,
163+
outputs=self.outputs,
164+
shared_mem=self.shared_mem,
165+
inputs_from_cpu=inputs_from_cpu,
166+
outputs_to_cpu=outputs_to_cpu,
167+
)
121168
else:
122169
execute_async(
123-
self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream
124-
)
170+
self.context,
171+
bindings=self.bindings,
172+
inputs=self.inputs,
173+
outputs=self.outputs,
174+
stream=self.stream,
175+
shared_mem=self.shared_mem,
176+
inputs_from_cpu=inputs_from_cpu,
177+
outputs_to_cpu=outputs_to_cpu,
178+
)
125179
return {out.name: out.host for out in self.outputs}
126180

127181
def set_binding_shape(self, binding: int, shape: tuple):

alonet/torch2trt/utils.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def release(self):
191191
self.shape = None
192192

193193

194-
def allocate_buffers(context, stream=None, sync_mode=True):
194+
def allocate_buffers(context, stream=None, sync_mode=True, shared_mem={}):
195195
"""
196196
Read bindings' information in ExecutionContext, create pagelocked np.ndarray in CPU,
197197
allocate corresponding memory in GPU.
@@ -222,7 +222,10 @@ def allocate_buffers(context, stream=None, sync_mode=True):
222222

223223
inputs = []
224224
outputs = []
225+
out_pointer = 0
225226
has_dynamic_axes = False
227+
inv_shared_mem = {v: k for k, v in shared_mem.items()}
228+
226229
if stream is None and not sync_mode:
227230
stream = cuda.Stream()
228231
for binding in context.engine:
@@ -237,14 +240,27 @@ def allocate_buffers(context, stream=None, sync_mode=True):
237240
else:
238241
size = trt.volume(shape) * context.engine.max_batch_size
239242
# Allocate host and device buffers
240-
host_mem = cuda.pagelocked_empty(size, dtype)
241-
device_mem = cuda.mem_alloc(host_mem.nbytes)
243+
if not context.engine.binding_is_input(binding):
244+
if out_pointer in shared_mem.values():
245+
# avoid allocating memory in gpu, just pass the same device_mem and host that corresponds.
246+
input_idx = inv_shared_mem[out_pointer]
247+
device_mem = inputs[input_idx].device
248+
host_mem = inputs[input_idx].host
249+
else:
250+
host_mem = cuda.pagelocked_empty(size, dtype)
251+
device_mem = cuda.mem_alloc(host_mem.nbytes)
252+
out_pointer += 1
253+
254+
else:
255+
host_mem = cuda.pagelocked_empty(size, dtype)
256+
device_mem = cuda.mem_alloc(host_mem.nbytes)
242257
mem_obj = HostDeviceMem(host_mem, device_mem, shape, dtype, binding)
243258
# Append to the appropriate list.
244259
if context.engine.binding_is_input(binding):
245260
inputs.append(mem_obj)
246261
else:
247262
outputs.append(mem_obj)
263+
248264
return inputs, outputs, stream, has_dynamic_axes
249265

250266

@@ -305,7 +321,7 @@ def get_bindings(context, dict_inputs, dict_outputs):
305321
return bindings
306322

307323

308-
def execute_async(context, bindings, inputs, outputs, stream):
324+
def execute_async(context, bindings, inputs, outputs, stream, shared_mem, inputs_from_cpu, outputs_to_cpu):
309325
"""
310326
Execute an TensorRT engine.
311327
@@ -318,19 +334,32 @@ def execute_async(context, bindings, inputs, outputs, stream):
318334
outputs: list[HostDeviceMem]
319335
stream: pycuda.driver.Stream
320336
used for memory transfers between CPU-GPU
337+
inputs_from_cpu: bool, reload inputs from CPU again.
338+
outputs_to_cpu: bool, transfer all outputs back from GPU to CPU.
321339
322340
Returns
323341
-------
324342
list : np.ndarray
325343
For each outputs of the engine
326344
"""
327345
# Transfer input data to the GPU.
328-
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
346+
if inputs_from_cpu:
347+
# Reload all inputs from "inputs"
348+
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
349+
else:
350+
# Reload all inputs from "inputs" except the ones with shared memory.
351+
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for i, inp in enumerate(inputs) if i not in shared_mem.keys()]
352+
329353
# Run inference.
330354
check = context.execute_async(bindings=bindings, stream_handle=stream.handle)
331355
assert check, "Kernel execution failed"
332356
# Transfer predictions back from the GPU.
333-
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
357+
if outputs_to_cpu:
358+
# All outputs
359+
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
360+
else:
361+
# only outputs with no memory shared
362+
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for i, out in enumerate(outputs) if i not in shared_mem.values()]
334363
# Synchronize the stream
335364
stream.synchronize()
336365
# Return only the host outputs.
@@ -339,7 +368,7 @@ def execute_async(context, bindings, inputs, outputs, stream):
339368
return [out.host for out in outputs]
340369

341370

342-
def execute_sync(context, bindings, inputs, outputs):
371+
def execute_sync(context, bindings, inputs, outputs, shared_mem, inputs_from_cpu, outputs_to_cpu):
343372
"""
344373
Execute an TensorRT engine.
345374
@@ -352,25 +381,36 @@ def execute_sync(context, bindings, inputs, outputs):
352381
outputs: list[HostDeviceMem]
353382
stream: pycuda.driver.Stream
354383
used for memory transfers between CPU-GPU
384+
inputs_from_cpu: bool, reload inputs from CPU again.
385+
outputs_to_cpu: bool, transfer back all outputs back from GPU to CPU.
355386
356387
Returns
357388
----------
358389
list[np.ndarray] for each output of the engine
359390
"""
360391
# Transfer input data to the GPU.
361-
[cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
392+
if inputs_from_cpu:
393+
# Reload all inputs from "inputs".
394+
[cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
395+
else:
396+
# Reload all inputs from "inputs" except the ones with shared memory.
397+
[cuda.memcpy_htod(inp.device, inp.host) for i, inp in enumerate(inputs) if i not in shared_mem.keys()]
362398
# Run inference.
363399
check = context.execute_v2(bindings=bindings)
364400
assert check, "Kernel execution failed"
365401
# Transfer predictions back from the GPU.
366-
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
402+
if outputs_to_cpu:
403+
# All outputs
404+
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
405+
else:
406+
# only outputs with no memory shared
407+
[cuda.memcpy_dtoh(out.host, out.device) for i, out in enumerate(outputs) if i not in shared_mem.values()]
367408
# Return only the host outputs.
368409
for out in outputs:
369410
out.host = out.host.reshape(out.shape)
370411
return [out.host for out in outputs]
371412

372413

373-
374414
def rename_nodes_(graph, verbose=False):
375415

376416
dont_rename = [v.name for v in graph.inputs + graph.outputs]

0 commit comments

Comments
 (0)