Skip to content

Commit 62cdfcb

Browse files
authored
Merge pull request #186 from Visual-Behavior/dev
add cuda shared memory for recurrent engines
2 parents fee0628 + bc6380f commit 62cdfcb

File tree

2 files changed

+109
-15
lines changed

2 files changed

+109
-15
lines changed

alonet/torch2trt/TRTExecutor.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def __init__(
5959
sync_mode: bool = False,
6060
verbose_logger: bool = False,
6161
profiling: bool = False,
62+
shared_mem: dict = {},
6263
):
6364
"""
6465
Parameters
@@ -69,11 +70,20 @@ def __init__(
6970
sync_mode: bool, default = False.
7071
True/False enable the synchronized/asynchronous execution of TensorRT engine
7172
logger: tensorrt.ILogger, logger to print info in terminal
73+
shared_mem: dict, input and output that share the same memory. Default {}.
74+
Output is redirected to the input after each execution.
75+
Example: shared_mem = {0: 1} makes input of index 0 share memory with output of index 1.
7276
"""
77+
assert isinstance(shared_mem, dict), f"shared_mem argument should be of type dict but got {shared_mem.__class__.__name__} instead"
78+
if shared_mem != {}:
79+
print("[WARNING] outputs with shared memory are static, please set outputs_to_cpu=True when executing if you want to retrieve them.")
80+
for inp, out in shared_mem.items():
81+
print(f"[INFO] input of index {inp} has shared memory with output of index {out}.")
7382
if prod_package_error is not None:
7483
raise prod_package_error
7584
self.sync_mode = sync_mode
7685
self.stream = stream
86+
self.shared_mem = shared_mem
7787
if verbose_logger:
7888
self.logger = trt.Logger(trt.Logger.VERBOSE)
7989
else:
@@ -91,7 +101,7 @@ def __init__(
91101
self.context.profiler = CustomProfiler()
92102
# Allocate_buffer take into account if engine has dynamic axes
93103
self.inputs, self.outputs, self.stream, self.has_dynamic_axes = allocate_buffers(
94-
self.context, self.stream, self.sync_mode
104+
self.context, self.stream, self.sync_mode, self.shared_mem,
95105
)
96106
self.dict_inputs = {mem_obj.name: mem_obj for mem_obj in self.inputs}
97107
self.dict_outputs = {mem_obj.name: mem_obj for mem_obj in self.outputs}
@@ -111,17 +121,61 @@ def print_bindings_info(self):
111121
shape: {self.engine.get_binding_shape(i)}, dtype: {self.engine.get_binding_dtype(i)}"
112122
)
113123

114-
def execute(self):
124+
def execute(self, inputs_from_cpu=False, outputs_to_cpu=False):
125+
"""Executes engine
126+
127+
Parameters
128+
----------
129+
inputs_from_cpu: bool, reload inputs from CPU again.
130+
outputs_to_cpu: bool, transfer all outputs back from GPU to CPU.
131+
132+
Examples
133+
--------
134+
~ Example of engine with 2 inputs and 2 outputs ~
135+
>>> # Normal use.
136+
>>> engine = TRTExecutor(**kwargs)
137+
>>> engine.inputs[0].host, engine.inputs[1].host = np.ones(1), np.ones(1)
138+
>>> outputs = engine.execute()
139+
>>>
140+
>>> # Engine with shared memory: redirecting output of index 1 to input of index 0.
141+
>>> engine = TRTExecutor(shared_mem={0: 1}, **kwargs)
142+
>>> for i, inp in enumerate(inputs):
143+
>>> engine.inputs[1].host = inp
144+
>>> ekwargs = {}
145+
>>> if i == 0:
146+
>>> ## First time only
147+
>>> engine.inputs[0].host = np.zeros(0)
148+
>>> ekwargs["inputs_from_cpu"] = True
149+
>>> engine.execute(**ekwargs)
150+
>>> # Retrieve last output 1 from gpu
151+
>>> engine.execute(outputs_to_cpu=True)
152+
>>> print(engine.outputs[1].host)
153+
"""
115154
if self.has_dynamic_axes:
116155
# Set input shape in context to update output shapes
117156
allocate_dynamic_mem(self.context, self.dict_inputs, self.dict_outputs)
118157

119158
if self.sync_mode:
120-
execute_sync(self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs)
159+
execute_sync(
160+
self.context,
161+
bindings=self.bindings,
162+
inputs=self.inputs,
163+
outputs=self.outputs,
164+
shared_mem=self.shared_mem,
165+
inputs_from_cpu=inputs_from_cpu,
166+
outputs_to_cpu=outputs_to_cpu,
167+
)
121168
else:
122169
execute_async(
123-
self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream
124-
)
170+
self.context,
171+
bindings=self.bindings,
172+
inputs=self.inputs,
173+
outputs=self.outputs,
174+
stream=self.stream,
175+
shared_mem=self.shared_mem,
176+
inputs_from_cpu=inputs_from_cpu,
177+
outputs_to_cpu=outputs_to_cpu,
178+
)
125179
return {out.name: out.host for out in self.outputs}
126180

127181
def set_binding_shape(self, binding: int, shape: tuple):

alonet/torch2trt/utils.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def release(self):
191191
self.shape = None
192192

193193

194-
def allocate_buffers(context, stream=None, sync_mode=True):
194+
def allocate_buffers(context, stream=None, sync_mode=True, shared_mem={}):
195195
"""
196196
Read bindings' information in ExecutionContext, create pagelocked np.ndarray in CPU,
197197
allocate corresponding memory in GPU.
@@ -222,7 +222,10 @@ def allocate_buffers(context, stream=None, sync_mode=True):
222222

223223
inputs = []
224224
outputs = []
225+
out_pointer = 0
225226
has_dynamic_axes = False
227+
inv_shared_mem = {v: k for k, v in shared_mem.items()}
228+
226229
if stream is None and not sync_mode:
227230
stream = cuda.Stream()
228231
for binding in context.engine:
@@ -237,14 +240,27 @@ def allocate_buffers(context, stream=None, sync_mode=True):
237240
else:
238241
size = trt.volume(shape) * context.engine.max_batch_size
239242
# Allocate host and device buffers
240-
host_mem = cuda.pagelocked_empty(size, dtype)
241-
device_mem = cuda.mem_alloc(host_mem.nbytes)
243+
if not context.engine.binding_is_input(binding):
244+
if out_pointer in shared_mem.values():
245+
# avoid allocating memory in gpu, just pass the same device_mem and host that corresponds.
246+
input_idx = inv_shared_mem[out_pointer]
247+
device_mem = inputs[input_idx].device
248+
host_mem = inputs[input_idx].host
249+
else:
250+
host_mem = cuda.pagelocked_empty(size, dtype)
251+
device_mem = cuda.mem_alloc(host_mem.nbytes)
252+
out_pointer += 1
253+
254+
else:
255+
host_mem = cuda.pagelocked_empty(size, dtype)
256+
device_mem = cuda.mem_alloc(host_mem.nbytes)
242257
mem_obj = HostDeviceMem(host_mem, device_mem, shape, dtype, binding)
243258
# Append to the appropriate list.
244259
if context.engine.binding_is_input(binding):
245260
inputs.append(mem_obj)
246261
else:
247262
outputs.append(mem_obj)
263+
248264
return inputs, outputs, stream, has_dynamic_axes
249265

250266

@@ -305,7 +321,7 @@ def get_bindings(context, dict_inputs, dict_outputs):
305321
return bindings
306322

307323

308-
def execute_async(context, bindings, inputs, outputs, stream):
324+
def execute_async(context, bindings, inputs, outputs, stream, shared_mem, inputs_from_cpu, outputs_to_cpu):
309325
"""
310326
Execute an TensorRT engine.
311327
@@ -318,19 +334,32 @@ def execute_async(context, bindings, inputs, outputs, stream):
318334
outputs: list[HostDeviceMem]
319335
stream: pycuda.driver.Stream
320336
used for memory transfers between CPU-GPU
337+
inputs_from_cpu: bool, reload inputs from CPU again.
338+
outputs_to_cpu: bool, transfer all outputs back from GPU to CPU.
321339
322340
Returns
323341
-------
324342
list : np.ndarray
325343
For each outputs of the engine
326344
"""
327345
# Transfer input data to the GPU.
328-
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
346+
if inputs_from_cpu:
347+
# Reload all inputs from "inputs"
348+
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
349+
else:
350+
# Reload all inputs from "inputs" except the ones with shared memory.
351+
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for i, inp in enumerate(inputs) if i not in shared_mem.keys()]
352+
329353
# Run inference.
330354
check = context.execute_async(bindings=bindings, stream_handle=stream.handle)
331355
assert check, "Kernel execution failed"
332356
# Transfer predictions back from the GPU.
333-
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
357+
if outputs_to_cpu:
358+
# All outputs
359+
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
360+
else:
361+
# only outputs with no memory shared
362+
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for i, out in enumerate(outputs) if i not in shared_mem.values()]
334363
# Synchronize the stream
335364
stream.synchronize()
336365
# Return only the host outputs.
@@ -339,7 +368,7 @@ def execute_async(context, bindings, inputs, outputs, stream):
339368
return [out.host for out in outputs]
340369

341370

342-
def execute_sync(context, bindings, inputs, outputs):
371+
def execute_sync(context, bindings, inputs, outputs, shared_mem, inputs_from_cpu, outputs_to_cpu):
343372
"""
344373
Execute an TensorRT engine.
345374
@@ -352,25 +381,36 @@ def execute_sync(context, bindings, inputs, outputs):
352381
outputs: list[HostDeviceMem]
353382
stream: pycuda.driver.Stream
354383
used for memory transfers between CPU-GPU
384+
inputs_from_cpu: bool, reload inputs from CPU again.
385+
outputs_to_cpu: bool, transfer back all outputs back from GPU to CPU.
355386
356387
Returns
357388
----------
358389
list[np.ndarray] for each output of the engine
359390
"""
360391
# Transfer input data to the GPU.
361-
[cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
392+
if inputs_from_cpu:
393+
# Reload all inputs from "inputs".
394+
[cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
395+
else:
396+
# Reload all inputs from "inputs" except the ones with shared memory.
397+
[cuda.memcpy_htod(inp.device, inp.host) for i, inp in enumerate(inputs) if i not in shared_mem.keys()]
362398
# Run inference.
363399
check = context.execute_v2(bindings=bindings)
364400
assert check, "Kernel execution failed"
365401
# Transfer predictions back from the GPU.
366-
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
402+
if outputs_to_cpu:
403+
# All outputs
404+
[cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
405+
else:
406+
# only outputs with no memory shared
407+
[cuda.memcpy_dtoh(out.host, out.device) for i, out in enumerate(outputs) if i not in shared_mem.values()]
367408
# Return only the host outputs.
368409
for out in outputs:
369410
out.host = out.host.reshape(out.shape)
370411
return [out.host for out in outputs]
371412

372413

373-
374414
def rename_nodes_(graph, verbose=False):
375415

376416
dont_rename = [v.name for v in graph.inputs + graph.outputs]

0 commit comments

Comments
 (0)