@@ -1,10 +1,16 @@
 from pathlib import Path
 
+import defs.ci_profiler
 import pytest
 from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
 from defs.conftest import get_device_memory, get_sm_version
 from defs.trt_test_alternative import check_call
 
+from tensorrt_llm import LLM
+from tensorrt_llm.executor.request import LoRARequest
+from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.sampling_params import SamplingParams
+
 # skip trt flow cases on post-Blackwell-Ultra
 if get_sm_version() >= 103:
     pytest.skip(
@@ -122,3 +128,71 @@ def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv,
     ]
 
     venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)
+
+
+@pytest.mark.skip_less_device(4)
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.parametrize("nemotron_nas_model_root", [
+    "Llama-3_3-Nemotron-Super-49B-v1",
+],
+                         indirect=True)
+def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
+                                            nemotron_nas_model_root,
+                                            llm_datasets_root, llm_rouge_root,
+                                            engine_dir, cmodel_dir):
+    """Run Nemotron Super 49B with real LoRA adapters using the LLM-API Torch backend."""
+
+    print("Testing Nemotron Super 49B with real LoRA adapters...")
+
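+    # NOTE: absolute path, assumed to be pre-staged on the machine running this test.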
+    lora_adapter_path = "/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
+    print(f"Using real LoRA from: {lora_adapter_path}")
+
+    defs.ci_profiler.start("test_nemotron_real_lora_torch")
+
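+    # Size the LoRA cache for exactly one adapter on GPU and one on CPU;
+    # max_lora_rank must be >= the adapter's rank (32 here).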
+    lora_config = LoraConfig(
+        lora_dir=[lora_adapter_path],
+        max_lora_rank=32,  # From adapter_config.json: "r": 32
+        max_loras=1,
+        max_cpu_loras=1,
+    )
+
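+    # TP=4 matches the skip_less_device(4) marker; small batch/sequence limits
+    # leave memory headroom for the 49B weights plus the adapter.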
+    with LLM(model=nemotron_nas_model_root,
+             lora_config=lora_config,
+             tensor_parallel_size=4,
+             dtype="bfloat16",
+             max_batch_size=2,
+             max_input_len=512,
+             max_seq_len=1024,
+             max_beam_width=1) as llm:
+
+        prompts = [
+            "What is the capital of France?",
+            "Explain quantum computing in simple terms."
+        ]
+
+        sampling_params = SamplingParams(max_tokens=50,
+                                         temperature=0.7,
+                                         top_p=0.9)
+
+        # One LoRARequest per prompt: when a list is passed, generate()
+        # pairs it with the prompts element-wise.
+        lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)
+                        ] * len(prompts)
+
+        print("Running inference with real LoRA adapter...")
+        outputs = llm.generate(prompts,
+                               sampling_params,
+                               lora_request=lora_request)
+
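+        # Echo prompt/response pairs so failures are easy to triage from logs.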
+        for i, output in enumerate(outputs):
+            print(f"Prompt {i+1}: {prompts[i]}")
+            print(f"Response {i+1}: {output.outputs[0].text}")
+            print("-" * 50)
+
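+        # Sanity checks: every prompt must yield a non-empty completion.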
+        assert len(outputs) == 2
+        assert len(outputs[0].outputs) > 0
+        assert len(outputs[1].outputs) > 0
+        assert len(outputs[0].outputs[0].text) > 0
+        assert len(outputs[1].outputs[0].text) > 0
+
+    defs.ci_profiler.stop("test_nemotron_real_lora_torch")
+    print(
+        f"test_nemotron_real_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"
+    )