Skip to content

Commit 4d7dfe6

Browse files
xuanzicdominicshanshan
authored and committed
[TRTLLM-6104] feat: add request_perf_metrics to triton LLMAPI backend (NVIDIA#5554)
Signed-off-by: Vivian Chen <[email protected]>
1 parent 2a70ddc commit 4d7dfe6

File tree

4 files changed

+166
-2
lines changed

4 files changed

+166
-2
lines changed

triton_backend/all_models/llmapi/tensorrt_llm/1/helpers.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,24 @@ def get_sampling_params_from_request(request, batch_size=1, batch_index=0):
2424
Used in llmapi/tensorrt_llm
2525
"""
2626
sampling_params_args = [
27-
'best_of', 'temperature', 'top_k', 'top_p', 'frequency_penalty',
28-
'presence_penalty', 'max_tokens', 'seed', 'exclude_input_from_output'
27+
'best_of',
28+
'temperature',
29+
'top_k',
30+
'top_p',
31+
'frequency_penalty',
32+
'presence_penalty',
33+
'max_tokens',
34+
'seed',
35+
'exclude_input_from_output',
36+
'return_perf_metrics',
2937
]
3038
param_mappings = {}
3139
for arg in sampling_params_args:
3240
param_mappings[f"sampling_param_{arg}"] = arg
3341
default_values = {
3442
'sampling_param_best_of': 1,
3543
'sampling_param_exclude_input_from_output': False,
44+
'sampling_param_return_perf_metrics': False,
3645
}
3746
kwargs = convert_request_input_to_dict(request, param_mappings,
3847
default_values, batch_size,

triton_backend/all_models/llmapi/tensorrt_llm/1/model.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from contextlib import asynccontextmanager
3434

3535
import numpy as np
36+
import pandas as pd
3637
import triton_python_backend_utils as pb_utils
3738
import yaml
3839
from helpers import (get_input_tensor_by_name, get_output_config_from_request,
@@ -450,6 +451,92 @@ def _create_response(self, request_output, output_config):
450451
pb_utils.Tensor(output_name,
451452
np.asarray(tensor_data, dtype=np.object_)))
452453

454+
if hasattr(request_output.outputs[0], 'request_perf_metrics'
455+
) and request_output.outputs[0].request_perf_metrics:
456+
457+
perf_metrics = request_output.outputs[0].request_perf_metrics
458+
459+
# kv cache perf metrics per request
460+
kv_metrics = perf_metrics.kv_cache_metrics
461+
462+
response.append(
463+
pb_utils.Tensor(
464+
"kv_cache_reused_block",
465+
np.asarray([kv_metrics.num_reused_blocks],
466+
dtype=self.output_dtype)))
467+
response.append(
468+
pb_utils.Tensor(
469+
"kv_cache_hit_rate",
470+
np.asarray([kv_metrics.kv_cache_hit_rate],
471+
dtype=self.output_dtype)))
472+
response.append(
473+
pb_utils.Tensor(
474+
"kv_cache_alloc_new_blocks",
475+
np.asarray([kv_metrics.num_new_allocated_blocks],
476+
dtype=self.output_dtype)))
477+
response.append(
478+
pb_utils.Tensor(
479+
"kv_cache_alloc_total_blocks",
480+
np.asarray([kv_metrics.num_total_allocated_blocks],
481+
dtype=self.output_dtype)))
482+
response.append(
483+
pb_utils.Tensor(
484+
"kv_cache_missed_block",
485+
np.asarray([kv_metrics.num_missed_blocks],
486+
dtype=self.output_dtype)))
487+
488+
# timing perf metrics per request
489+
timing_metrics = perf_metrics.timing_metrics
490+
response.append(
491+
pb_utils.Tensor(
492+
"arrival_time_ns",
493+
np.asarray(
494+
[pd.Timedelta(timing_metrics.arrival_time).value],
495+
dtype=self.output_dtype)))
496+
497+
response.append(
498+
pb_utils.Tensor(
499+
"first_scheduled_time_ns",
500+
np.asarray([
501+
pd.Timedelta(timing_metrics.first_scheduled_time).value
502+
],
503+
dtype=self.output_dtype)))
504+
505+
response.append(
506+
pb_utils.Tensor(
507+
"first_token_time_ns",
508+
np.asarray(
509+
[pd.Timedelta(timing_metrics.first_token_time).value],
510+
dtype=self.output_dtype)))
511+
512+
response.append(
513+
pb_utils.Tensor(
514+
"last_token_time_ns",
515+
np.asarray(
516+
[pd.Timedelta(timing_metrics.last_token_time).value],
517+
dtype=self.output_dtype)))
518+
519+
#spec dec perf metrics per request
520+
spec_dec_metrics = perf_metrics.speculative_decoding
521+
522+
response.append(
523+
pb_utils.Tensor(
524+
"acceptance_rate",
525+
np.asarray([spec_dec_metrics.acceptance_rate],
526+
dtype=self.output_dtype)))
527+
528+
response.append(
529+
pb_utils.Tensor(
530+
"total_accepted_draft_tokens",
531+
np.asarray([spec_dec_metrics.total_accepted_draft_tokens],
532+
dtype=self.output_dtype)))
533+
534+
response.append(
535+
pb_utils.Tensor(
536+
"total_draft_tokens",
537+
np.asarray([spec_dec_metrics.total_draft_tokens],
538+
dtype=self.output_dtype)))
539+
453540
return pb_utils.InferenceResponse(output_tensors=response)
454541

455542
def finalize(self):

triton_backend/all_models/llmapi/tensorrt_llm/config.pbtxt

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,12 @@ input [
118118
dims: [ 1 ]
119119
optional: true
120120
},
121+
{
122+
name: "sampling_param_return_perf_metrics"
123+
data_type: TYPE_BOOL
124+
dims: [ 1 ]
125+
optional: true
126+
},
121127
## Arguments for Controlling Response Output Fields ##
122128
{
123129
name: "return_finish_reason"
@@ -161,5 +167,65 @@ output [
161167
name: "cumulative_logprob"
162168
data_type: TYPE_FP32
163169
dims: [-1]
170+
},
171+
{
172+
name: "kv_cache_reused_block"
173+
data_type: TYPE_INT32
174+
dims: [-1]
175+
},
176+
{
177+
name: "kv_cache_missed_block"
178+
data_type: TYPE_INT32
179+
dims: [-1]
180+
},
181+
{
182+
name: "kv_cache_alloc_new_blocks"
183+
data_type: TYPE_INT32
184+
dims: [-1]
185+
},
186+
{
187+
name: "kv_cache_alloc_total_blocks"
188+
data_type: TYPE_INT32
189+
dims: [-1]
190+
},
191+
{
192+
name: "kv_cache_hit_rate"
193+
data_type: TYPE_FP32
194+
dims: [-1]
195+
},
196+
{
197+
name: "arrival_time_ns"
198+
data_type: TYPE_INT64
199+
dims: [ 1 ]
200+
},
201+
{
202+
name: "first_scheduled_time_ns"
203+
data_type: TYPE_INT64
204+
dims: [ 1 ]
205+
},
206+
{
207+
name: "first_token_time_ns"
208+
data_type: TYPE_INT64
209+
dims: [ 1 ]
210+
},
211+
{
212+
name: "last_token_time_ns"
213+
data_type: TYPE_INT64
214+
dims: [ 1 ]
215+
},
216+
{
217+
name: "acceptance_rate"
218+
data_type: TYPE_FP32
219+
dims: [ 1 ]
220+
},
221+
{
222+
name: "total_accepted_draft_tokens"
223+
data_type: TYPE_INT32
224+
dims: [ 1 ]
225+
},
226+
{
227+
name: "total_draft_tokens"
228+
data_type: TYPE_INT32
229+
dims: [ 1 ]
164230
}
165231
]

triton_backend/all_models/tests/test_llmapi_python_backend.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ def inputs(streaming=False):
143143
"sampling_param_seed": [2],
144144
"return_finish_reason": [True],
145145
"return_stop_reason": [True],
146+
"sampling_param_return_perf_metrics": [True]
146147
}
147148

148149

@@ -164,6 +165,7 @@ def test_get_sampling_params_from_request():
164165
assert config["frequency_penalty"] == 0.0
165166
assert config["presence_penalty"] == 0.0
166167
assert config["seed"] == 2
168+
assert config["return_perf_metrics"] == True
167169
assert np.array_equal(config["stop"], np.array(['\n', 'stop']))
168170

169171

0 commit comments

Comments (0)