
Commit 1e92f00

Fix formats and address comments
Signed-off-by: Hui Gao <[email protected]>
1 parent 8de6493 commit 1e92f00

File tree

2 files changed: +31 -21 lines changed


tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 31 additions & 20 deletions
@@ -600,9 +600,9 @@ def host_kv_cache_pool_mapping(self) -> Optional[torch.Tensor]:
 
     def __post_init__(self) -> None:
         super().__post_init__()
-        self.__post_init_with_buffers(self.cuda_graph_buffers)
+        self._post_init_with_buffers(self.cuda_graph_buffers)
 
-    def __post_init_with_buffers(self, buffers) -> None:
+    def _post_init_with_buffers(self, buffers) -> None:
 
         # Set a default value, as max_num_sequences is not always set.
         if self.max_num_sequences is None:
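Note on the rename: in Python, a double leading underscore on a method name triggers name mangling (the interpreter rewrites `__post_init_with_buffers` to `_ClassName__post_init_with_buffers`), which complicates overriding and calling the method from subclasses; a single leading underscore is the usual "internal" convention without that side effect. A minimal illustration with hypothetical class names, not from this file:

class Base:
    def __helper(self):   # mangled to _Base__helper
        return "base"

class Child(Base):
    def __helper(self):   # mangled to _Child__helper; does NOT override Base's
        return "child"

c = Child()
print(c._Base__helper())   # "base" -- Base-internal calls still hit Base's copy
print(c._Child__helper())  # "child"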
@@ -624,8 +624,6 @@ def get_empty(tensor_shape: list[int], dtype: torch.dtype,
             Args:
                 tensor_shape: The required shape.
                 dtype: The required dtype.
-                buffers: A dictionary mapping cache names to lists of buffer tensors.
-                    Can be `None` or empty.
                 cache_name: The key for the specific list of buffers to search in.
 
             Returns:
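For context on what `get_empty` does with `buffers` and `cache_name`: per the docstring, `buffers` maps cache names to lists of buffer tensors, and `get_empty` searches the list under `cache_name` so that repeated setups can reuse the same allocations (stable addresses are what CUDA graph replays require). The actual implementation is not shown in this hunk; the following is a minimal sketch under assumed semantics, with `make_get_empty` and the exact reuse criterion being illustrative:

import math

import torch


def make_get_empty(buffers):
    # buffers: dict mapping cache names to lists of buffer tensors;
    # may be None or empty (see the docstring above).

    def get_empty(tensor_shape, dtype, cache_name):
        numel = math.prod(tensor_shape)
        # Reuse a registered buffer when dtype matches and it is big enough;
        # reuse keeps tensor addresses stable across CUDA graph replays.
        for buf in (buffers or {}).get(cache_name, []):
            if buf.dtype == dtype and buf.numel() >= numel:
                return buf.flatten()[:numel].view(tensor_shape)
        # Cache miss: allocate fresh memory and register it for later reuse.
        new_buf = torch.empty(tensor_shape, dtype=dtype, device='cuda')
        if buffers is not None:
            buffers.setdefault(cache_name, []).append(new_buf)
        return new_buf

    return get_empty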
@@ -652,20 +650,26 @@ def get_empty_like(like_tensor: torch.Tensor,
 
         def get_empty_like(like_tensor: torch.Tensor,
                            cache_name: str) -> torch.Tensor:
-            return get_empty(like_tensor.shape,
-                             cache_name=cache_name,
-                             dtype=like_tensor.dtype)
+            return get_empty(
+                like_tensor.shape,
+                cache_name=cache_name,
+                dtype=like_tensor.dtype,
+            )
 
-        self.prompt_lens_cuda = get_empty((self.max_num_sequences, ),
-                                          cache_name="prompt_lens_cuda",
-                                          dtype=torch.int)
+        self.prompt_lens_cuda = get_empty(
+            (self.max_num_sequences, ),
+            cache_name="prompt_lens_cuda",
+            dtype=torch.int,
+        )
         self.prompt_lens_cpu = torch.empty_like(
             self.prompt_lens_cuda,
             device='cpu',
             pin_memory=True,
         )
-        self.kv_lens_cuda = get_empty_like(self.prompt_lens_cuda,
-                                           cache_name="kv_lens_cuda")
+        self.kv_lens_cuda = get_empty_like(
+            self.prompt_lens_cuda,
+            cache_name="kv_lens_cuda",
+        )
         self.kv_lens = torch.empty_like(self.kv_lens_cuda,
                                         device='cpu',
                                         pin_memory=True)
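The recurring `torch.empty_like(..., device='cpu', pin_memory=True)` mirrors are the standard staging pattern: pinned (page-locked) host memory is what lets a later `copy_(..., non_blocking=True)` overlap the host-to-device transfer with GPU work; with pageable memory the copy degrades to a synchronous one. A small sketch of how such a pair is typically used (sizes and values are illustrative):

import torch

# Device tensor plus its pinned-host staging mirror.
prompt_lens_cuda = torch.empty(8, dtype=torch.int, device='cuda')
prompt_lens_cpu = torch.empty_like(prompt_lens_cuda,
                                   device='cpu',
                                   pin_memory=True)

# Fill the staging tensor on the host, then copy asynchronously.
prompt_lens_cpu[:4] = torch.tensor([5, 9, 3, 7], dtype=torch.int)
prompt_lens_cuda[:4].copy_(prompt_lens_cpu[:4], non_blocking=True)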
@@ -685,7 +689,8 @@ def get_empty_like(like_tensor: torch.Tensor,
                 self.kv_cache_manager.max_blocks_per_seq
             ],
             cache_name="kv_cache_block_offsets",
-            dtype=torch.int32)
+            dtype=torch.int32,
+        )
         self.host_kv_cache_block_offsets = torch.empty_like(
             self.kv_cache_block_offsets,
             device='cpu',
@@ -700,20 +705,23 @@ def get_empty_like(like_tensor: torch.Tensor,
                 self.kv_cache_manager.max_blocks_per_seq
             ],
             cache_name="block_ids_per_seq",
-            dtype=torch.int32)
+            dtype=torch.int32,
+        )
         self.kv_block_ids_per_seq = get_empty(
             [
                 self.kv_cache_manager.max_batch_size,
                 self.kv_cache_manager.max_blocks_per_seq
             ],
             cache_name="kv_block_ids_per_seq",
-            dtype=torch.int32)
+            dtype=torch.int32,
+        )
         if self.enable_paged_context_mla:
             # for kv cache reuse/chunked context in MLA
             self.ctx_cached_token_indptr = get_empty(
                 (self.max_num_requests + 1, ),
                 cache_name="ctx_cached_token_indptr",
-                dtype=torch.int64)
+                dtype=torch.int64,
+            )
             self.host_ctx_cached_token_indptr = torch.zeros_like(
                 self.ctx_cached_token_indptr,
                 device='cpu',
@@ -722,16 +730,19 @@ def get_empty_like(like_tensor: torch.Tensor,
             self.ctx_uncached_token_indptr = get_empty(
                 (self.max_num_requests + 1, ),
                 cache_name="ctx_uncached_token_indptr",
-                dtype=torch.int64)
+                dtype=torch.int64,
+            )
             self.host_ctx_uncached_token_indptr = torch.zeros_like(
                 self.ctx_uncached_token_indptr,
                 device='cpu',
                 pin_memory=True,
             )
             # context full seqlens include cached tokens and uncached tokens
-            self.ctx_kv_indptr = get_empty((self.max_num_requests + 1, ),
-                                           cache_name="ctx_kv_indptr",
-                                           dtype=torch.int64)
+            self.ctx_kv_indptr = get_empty(
+                (self.max_num_requests + 1, ),
+                cache_name="ctx_kv_indptr",
+                dtype=torch.int64,
+            )
             self.host_ctx_kv_indptr = torch.zeros_like(
                 self.ctx_kv_indptr,
                 device='cpu',
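The `*_indptr` tensors sized `(self.max_num_requests + 1, )` follow the CSR-style convention: entry i is the running token total before request i, so request i's tokens occupy the half-open range [indptr[i], indptr[i+1]), and the extra leading zero slot is why the length is requests + 1. A short sketch of building one from per-request counts (the counts are illustrative, not from the diff):

import torch

cached_tokens_per_request = torch.tensor([12, 0, 7], dtype=torch.int64)
num_requests = cached_tokens_per_request.numel()

# indptr[0] = 0; indptr[i + 1] = total tokens of requests 0..i.
ctx_cached_token_indptr = torch.zeros(num_requests + 1, dtype=torch.int64)
ctx_cached_token_indptr[1:] = torch.cumsum(cached_tokens_per_request, dim=0)

print(ctx_cached_token_indptr)  # tensor([ 0, 12, 12, 19])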

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -278,7 +278,6 @@ full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5410391)
 accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197)
 accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl SKIP (https://nvbugs/5413362)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5455140)
