21 changes: 21 additions & 0 deletions tests/integration/defs/common.py
@@ -16,6 +16,7 @@
import os
import platform
import re
import time
from difflib import SequenceMatcher
from pathlib import Path

@@ -771,16 +772,23 @@ def test_multi_lora_support(
zero_lora_weights=True,
use_code_prompts=False,
):
start_time = time.time()
print("Creating dummy LoRAs...")
lora_start = time.time()
lora_paths = generate_dummy_loras(
hf_model_dir=hf_model_dir,
lora_output_dir=llm_venv.get_working_directory(),
num_loras=num_loras,
lora_rank=lora_rank,
target_modules=target_hf_modules,
zero_weights=zero_lora_weights)
lora_end = time.time()
print(
f"Creating dummy LoRAs completed in {(lora_end - lora_start):.2f} seconds."
)

print("Build engines...")
build_start = time.time()
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={tllm_ckpt_dir}",
@@ -801,6 +809,9 @@ def test_multi_lora_support(
"--max_beam_width=1",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
build_end = time.time()
print(
f"Build engines completed in {(build_end - build_start):.2f} seconds.")

if use_code_prompts:
input_prompts = [
@@ -822,6 +833,7 @@
]

print("Run inference with C++ runtime with pybind...")
inference_start = time.time()
run_script = f"{example_root}/../../../run.py" if "core" in example_root else f"{example_root}/../run.py"
run_cmd = [
run_script,
@@ -842,6 +854,15 @@
"--max_output_len=30",
]
venv_check_call(llm_venv, run_cmd)
inference_end = time.time()
print(
f"Inference completed in {(inference_end - inference_start):.2f} seconds."
)

total_time = time.time() - start_time
print(
f"Total test_multi_lora_support execution time: {total_time:.2f} seconds"
)


def get_dummy_spec_decoding_heads(hf_model_dir,
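Every stage above repeats the same start/stop/print bookkeeping by hand. A context manager could factor that pattern out; the sketch below is hypothetical (a timed helper is not part of this PR) and uses time.perf_counter(), a monotonic clock better suited to measuring intervals than time.time():

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print how long the enclosed block took, matching the log style above."""
    start = time.perf_counter()  # monotonic, unaffected by system clock adjustments
    try:
        yield
    finally:
        print(f"{label} completed in {time.perf_counter() - start:.2f} seconds.")

# Hypothetical usage inside test_multi_lora_support:
# with timed("Creating dummy LoRAs"):
#     lora_paths = generate_dummy_loras(...)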
16 changes: 16 additions & 0 deletions tests/integration/defs/examples/test_gemma.py
@@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from pathlib import Path

import pytest
@@ -429,7 +430,9 @@ def test_hf_gemma_fp8_base_bf16_multi_lora(gemma_model_root,
batch_size=8):
"Run Gemma models with multiple dummy LoRAs."

start_time = time.time()
print("Convert checkpoint by modelopt...")
convert_start = time.time()
kv_cache_dtype = 'fp8' if qformat == 'fp8' else 'int8'
convert_cmd = [
f"{gemma_example_root}/../../../quantization/quantize.py",
@@ -441,7 +444,13 @@
f"--output_dir={cmodel_dir}",
]
venv_check_call(llm_venv, convert_cmd)
convert_end = time.time()
print(
f"Convert checkpoint completed in {(convert_end - convert_start):.2f} seconds."
)

test_multi_lora_start = time.time()
print("Calling test_multi_lora_support...")
test_multi_lora_support(
hf_model_dir=gemma_model_root,
tllm_ckpt_dir=cmodel_dir,
@@ -454,3 +463,10 @@ def test_hf_gemma_fp8_base_bf16_multi_lora(gemma_model_root,
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
)
test_multi_lora_end = time.time()
print(
f"test_multi_lora_support completed in {(test_multi_lora_end - test_multi_lora_start):.2f} seconds"
)

total_time = time.time() - start_time
print(f"Total function execution time: {total_time:.2f} seconds")
17 changes: 17 additions & 0 deletions tests/integration/defs/examples/test_granite.py
@@ -14,6 +14,7 @@
# limitations under the License.

import os
import time

import pytest
from defs.common import (convert_weights, test_multi_lora_support,
@@ -96,7 +97,9 @@ def test_granite_bf16_lora(llama_example_root,
"Run Granite 3.0 models with multiple dummy LoRAs."

# TODO: Enable fp8 quantization when ModelOpt changes for Granite are available.
start_time = time.time()
print("Converting checkpoint...")
convert_start = time.time()
model_name = os.path.basename(llm_granite_model_root)
dtype = 'bfloat16'

@@ -108,6 +111,11 @@
model_path=llm_granite_model_root,
data_type=dtype,
)
convert_end = time.time()
print(
f"Convert checkpoint completed in {(convert_end - convert_start):.2f} seconds."
)

target_hf_modules = [
"q_proj",
"k_proj",
@@ -122,6 +130,8 @@
target_hf_modules += ["moe_h_to_4h", "moe_4h_to_h", "moe_gate"]
target_trtllm_modules += ["moe_h_to_4h", "moe_4h_to_h", "moe_gate"]

print("Calling test_multi_lora_support...")
test_multi_lora_start = time.time()
test_multi_lora_support(
hf_model_dir=llm_granite_model_root,
tllm_ckpt_dir=ckpt_dir,
@@ -134,3 +144,10 @@
target_trtllm_modules=target_trtllm_modules,
zero_lora_weights=True,
)
test_multi_lora_end = time.time()
print(
f"test_multi_lora_support completed in {(test_multi_lora_end - test_multi_lora_start):.2f} seconds"
)

total_time = time.time() - start_time
print(f"Total function execution time: {total_time:.2f} seconds")
2 changes: 1 addition & 1 deletion tests/integration/test_lists/test-db/l0_l40s.yml
@@ -106,6 +106,6 @@ l0_l40s:
- examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8]
- examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
- examples/test_granite.py::test_granite_bf16_lora[granite-3.0-2b-instruct]
- examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct]
- examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] TIMEOUT (90)
- examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v3-8b-instruct-hf]
- examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
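Note: the TIMEOUT (90) marker on the re-enabled granite entry raises that test's time budget (the unit is presumably minutes); the stage-level timing prints added above are what let an overrun be attributed to a specific step.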
2 changes: 0 additions & 2 deletions tests/integration/test_lists/waives.txt
@@ -393,9 +393,7 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5320234)
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5374145)
stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5375646)
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5376087)
full:GH200/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5375966)
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1] SKIP (https://nvbugs/5377465)