Commit a73d28d

Fix nanobind build

Signed-off-by: Patrice Castonguay <[email protected]>
1 parent: 0c1a2ea

4 files changed: +74 / -3 lines changed


cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp

Lines changed: 1 addition & 1 deletion

@@ -129,7 +129,7 @@ void initConfigBindings(nb::module_& m)
         .def(nb::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
                  std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
                  std::optional<float> const&, std::optional<tle::RetentionPriority>, size_t const&, bool, bool, bool,
-                 std::optional<RuntimeDefaults> const&>(),
+                 SizeType32, std::optional<RuntimeDefaults> const&>(),
             nb::arg("enable_block_reuse") = true, nb::arg("max_tokens") = nb::none(),
             nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none(),
             nb::arg("free_gpu_memory_fraction") = nb::none(), nb::arg("host_cache_size") = nb::none(),

tensorrt_llm/_utils.py

Lines changed: 1 addition & 1 deletion

@@ -1017,7 +1017,7 @@ def to_json_str(cls, event):
             "event_id": event.event_id,
             "data": event_serialize_func(event.data),
             "window_size": event.window_size,
-            "attention_dp_rank": event_serialize_func(event.attention_dp_rank),
+            "attention_dp_rank": event.attention_dp_rank,
         }

     @staticmethod
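The change above stops passing attention_dp_rank through event_serialize_func: unlike event.data, the rank is already a plain integer (or None) and can go straight into the JSON dictionary. As a rough illustration, with hypothetical values rather than real executor event objects, the serialized event now has this shape:

import json

# Illustrative sketch only: one serialized KV cache event after the fix.
# "data" is still produced by event_serialize_func(event.data), while
# attention_dp_rank is emitted as-is.
event_dict = {
    "event_id": 1,
    "data": {"type": "stored", "blocks": []},  # stand-in for the serialized event data
    "window_size": 32,
    "attention_dp_rank": 0,                    # plain int from event.attention_dp_rank
}
print(json.dumps(event_dict))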

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 2 additions & 1 deletion

@@ -136,6 +136,7 @@ l0_dgx_h200:
   - unittest/llmapi/test_llm_multi_gpu.py -m "gpu2 and part3"
   - unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0"
   - unittest/llmapi/test_llm_multi_gpu.py -m "not (gpu2 or gpu4)"
+  - unittest/llmapi/test_llm_kv_cache_events.py::test_llm_api_attention_dp_kv_events
   - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8]
   - llmapi/test_llm_e2e.py::test_llmapi_exit_multi_gpu
   - test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b]
@@ -151,7 +152,7 @@ l0_dgx_h200:
   - examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion]
   - examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b]
   - examples/test_mamba.py::test_llm_mamba2_2gpu[mamba-codestral-7B-v0.1]
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_tp2
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_prequantized_tp2
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_gptq_prequantized_tp2

tests/unittest/llmapi/test_llm_kv_cache_events.py

Lines changed: 70 additions & 0 deletions

@@ -1,6 +1,8 @@
 import asyncio
 import time
 
+from utils.util import skip_single_gpu
+
 import tensorrt_llm
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
@@ -203,3 +205,71 @@ def test_llm_kv_events_api():
 
     # no more events after request is finished
     assert not llm.get_kv_cache_events(5)
+
+
+@skip_single_gpu
+def test_llm_api_attention_dp_kv_events():
+    llm = LLM(model=llama_model_path,
+              tensor_parallel_size=2,
+              enable_attention_dp=True,
+              kv_cache_config=global_kvcache_config,
+              enable_autotuner=False)
+
+    sampling_params = SamplingParams(max_tokens=6, temperature=0.01)
+
+    requests = []
+    for i in range(3):
+        input_tokens = list(range(127 + i))[i:]
+        requests.append(input_tokens)
+
+    _ = llm.generate(requests[0], sampling_params=sampling_params)
+    events1 = llm.get_kv_cache_events(5)
+
+    # Should have 1 stored event and 1 created event
+    event = events1.pop(0)  # created event
+    while events1:
+        event = events1.pop(0)
+        if event:
+            assert event["event_id"] == 1
+            assert event["data"]["type"] == "stored"
+            assert event["attention_dp_rank"] == 0
+            assert event["window_size"] == 32
+            assert len(event["data"]["blocks"]) == 5
+
+    _ = llm.generate(requests[1], sampling_params=sampling_params)
+    events2 = llm.get_kv_cache_events(5)
+
+    while events2:
+        event = events2.pop(0)
+        if event:
+            if event["event_id"] == 2:
+                # 2 removed events needed
+                # should be a removed event to make space for context block
+                assert event["data"]["type"] == "removed"
+                assert event["data"]["block_hashes"]
+            elif event["event_id"] == 3:
+                assert event["data"]["type"] == "removed"
+                assert event["data"]["block_hashes"]
+            # stored event for 2nd request
+            elif event["event_id"] == 4:
+                assert event["data"]["type"] == "stored"
+                assert len(event["data"]["blocks"]) == 5
+
+    #_ = llm.generate(requests[2], sampling_params=sampling_params)
+    #events3 = llm.get_kv_cache_events(5)
+
+    #while events3:
+    #    event = events3.pop(0)
+    #    if event:
+    #        if event["event_id"] == 5:
+    #            assert event["data"]["type"] == "removed"
+    #            assert event["data"]["block_hashes"]
+    #        elif event["event_id"] == 6:
+    #            assert event["data"]["type"] == "removed"
+    #            assert event["data"]["block_hashes"]
+    #        elif event["event_id"] == 7:
+    #            assert event["data"]["type"] == "stored"
+    #            assert len(event["data"]["blocks"]) == 5
+
+    ## no more events after request is finished
+    #assert not llm.get_kv_cache_events(5)
