From 3e2d2a562d67c1e8c4c31282e70152f806986a67 Mon Sep 17 00:00:00 2001 From: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:16:38 -0700 Subject: [PATCH 1/6] Fix KV cache reporting. Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/reporting.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index acf7f60bcbb..9aad2e7845c 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -306,8 +306,11 @@ def get_statistics_dict(self) -> Dict[str, Any]: KvCacheConfig()) if isinstance(kv_cache_config, KvCacheConfig): kv_cache_dtype = kv_cache_config.dtype + kv_cache_mem_percent = kv_cache_config.free_gpu_memory_fraction elif isinstance(kv_cache_config, dict): kv_cache_dtype = kv_cache_config.get("dtype", "auto") + kv_cache_mem_percent = kv_cache_config.get( + "free_gpu_memory_fraction") else: raise ValueError( f"Invalid kv_cache_config type: {type(kv_cache_config)}.") @@ -336,8 +339,7 @@ def get_statistics_dict(self) -> Dict[str, Any]: "max_batch_size": self.rt_cfg.settings_config.max_batch_size, "max_num_tokens": self.rt_cfg.settings_config.max_num_tokens, "scheduling_policy": self.rt_cfg.settings_config.scheduler_policy, - "kv_cache_percentage": - self.rt_cfg.settings_config.kv_cache_percent * 100.0, + "kv_cache_percentage": kv_cache_mem_percent * 100.0, "issue_rate": self.convert_rate_to_s(self.statistics.issue_rate_ns) } From 53830e0c803fb96f6d530af2754e65c813e151fe Mon Sep 17 00:00:00 2001 From: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:24:03 -0700 Subject: [PATCH 2/6] Further tweaks for KV info. Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/reporting.py | 25 +++++++++++---------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index 9aad2e7845c..d8f95b58a01 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -273,6 +273,19 @@ def get_statistics_dict(self) -> Dict[str, Any]: }, } + # Retrieve KV cache information. + kv_cache_config = self.kwargs.get("kv_cache_config", KvCacheConfig()) + if isinstance(kv_cache_config, KvCacheConfig): + kv_cache_dtype = kv_cache_config.dtype + kv_cache_mem_percent = kv_cache_config.free_gpu_memory_fraction + elif isinstance(kv_cache_config, dict): + kv_cache_dtype = kv_cache_config.get("dtype", "auto") + kv_cache_mem_percent = kv_cache_config.get( + "free_gpu_memory_fraction") + else: + raise ValueError( + f"Invalid kv_cache_config type: {type(kv_cache_config)}.") + # Engine/Backend details if self.rt_cfg.backend not in ('pytorch', '_autodeploy'): config_path = self.rt_cfg.engine_dir / "config.json" @@ -302,18 +315,6 @@ def get_statistics_dict(self) -> Dict[str, Any]: model = self.rt_cfg.model_path or self.rt_cfg.model model_config = ModelConfig.from_pretrained(model, trust_remote_code=True) - kv_cache_config = self.kwargs.get("kv_cache_config", - KvCacheConfig()) - if isinstance(kv_cache_config, KvCacheConfig): - kv_cache_dtype = kv_cache_config.dtype - kv_cache_mem_percent = kv_cache_config.free_gpu_memory_fraction - elif isinstance(kv_cache_config, dict): - kv_cache_dtype = kv_cache_config.get("dtype", "auto") - kv_cache_mem_percent = kv_cache_config.get( - "free_gpu_memory_fraction") - else: - raise ValueError( - f"Invalid kv_cache_config type: {type(kv_cache_config)}.") validate_and_set_kv_cache_quant(model_config, kv_cache_dtype) From 4ce53b30c88babb49a4679506c3d8683fb548d65 Mon Sep 17 00:00:00 2001 From: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Date: Wed, 20 Aug 2025 18:59:48 -0700 Subject: [PATCH 3/6] Handle none case. Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/reporting.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index d8f95b58a01..17945bf7e05 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -286,6 +286,9 @@ def get_statistics_dict(self) -> Dict[str, Any]: raise ValueError( f"Invalid kv_cache_config type: {type(kv_cache_config)}.") + kv_cache_mem_percent = kv_cache_mem_percent * 100.0 \ + if kv_cache_mem_percent is not None else "None" + # Engine/Backend details if self.rt_cfg.backend not in ('pytorch', '_autodeploy'): config_path = self.rt_cfg.engine_dir / "config.json" From ce4b5eec7c05acf013f96e83c21fde6d81cd22f4 Mon Sep 17 00:00:00 2001 From: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:51:30 -0700 Subject: [PATCH 4/6] Fix operator priority. Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/reporting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index 17945bf7e05..ba9eaa807f3 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -286,7 +286,7 @@ def get_statistics_dict(self) -> Dict[str, Any]: raise ValueError( f"Invalid kv_cache_config type: {type(kv_cache_config)}.") - kv_cache_mem_percent = kv_cache_mem_percent * 100.0 \ + kv_cache_mem_percent = (kv_cache_mem_percent * 100.0) \ if kv_cache_mem_percent is not None else "None" # Engine/Backend details From 80a7cc28d3d42d28017d1ac5f954c4c24f3ef480 Mon Sep 17 00:00:00 2001 From: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:27:02 -0700 Subject: [PATCH 5/6] Fix missed multiplication. Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/reporting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index ba9eaa807f3..8f10ddfc90a 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -343,7 +343,7 @@ def get_statistics_dict(self) -> Dict[str, Any]: "max_batch_size": self.rt_cfg.settings_config.max_batch_size, "max_num_tokens": self.rt_cfg.settings_config.max_num_tokens, "scheduling_policy": self.rt_cfg.settings_config.scheduler_policy, - "kv_cache_percentage": kv_cache_mem_percent * 100.0, + "kv_cache_percentage": kv_cache_mem_percent, "issue_rate": self.convert_rate_to_s(self.statistics.issue_rate_ns) } From 46449e2817107bc2fc3decc9d6caa51568e34a27 Mon Sep 17 00:00:00 2001 From: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Date: Thu, 21 Aug 2025 23:57:30 -0700 Subject: [PATCH 6/6] Update to string formatting. Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/reporting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index 8f10ddfc90a..fd76466cd5a 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -286,7 +286,7 @@ def get_statistics_dict(self) -> Dict[str, Any]: raise ValueError( f"Invalid kv_cache_config type: {type(kv_cache_config)}.") - kv_cache_mem_percent = (kv_cache_mem_percent * 100.0) \ + kv_cache_mem_percent = f"{kv_cache_mem_percent * 100.0:.2f}%" \ if kv_cache_mem_percent is not None else "None" # Engine/Backend details @@ -532,7 +532,7 @@ def report_statistics(self) -> None: f"Max Runtime Batch Size: {world_info['max_batch_size']}\n" f"Max Runtime Tokens: {world_info['max_num_tokens']}\n" f"Scheduling Policy: {world_info['scheduling_policy']}\n" - f"KV Memory Percentage: {world_info['kv_cache_percentage']:.2f}%\n" + f"KV Memory Percentage: {world_info['kv_cache_percentage']}\n" f"Issue Rate (req/sec): {world_info['issue_rate']:.4E}\n" f"\n")