@@ -303,10 +303,13 @@ def __init__(
         self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
                                                   dtype=torch.int32)
         self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
-        self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=self.device)
+        # Because inputs_embeds may be bfloat16 and we don't need a numpy
+        # version of this tensor, avoid a RuntimeError by not creating a
+        # numpy buffer.
+        self.inputs_embeds = self._make_buffer(self.max_num_tokens,
+                                               self.hidden_size,
+                                               dtype=self.dtype,
+                                               numpy=False)
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
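Note: the RuntimeError referenced in the new comment comes from torch's numpy bridge, which has no bfloat16 counterpart. A minimal reproduction sketch (the exact exception type can vary by PyTorch version):

    import torch

    t = torch.zeros(4, dtype=torch.bfloat16)
    try:
        t.numpy()  # numpy has no bfloat16 dtype, so the conversion raises
    except (RuntimeError, TypeError) as e:
        print(e)  # e.g. "Got unsupported ScalarType BFloat16"
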
@@ -374,11 +377,18 @@ def __init__(
             device="cpu",
             pin_memory=self.pin_memory)
 
-    def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer:
-        return CpuGpuBuffer(*args,
+    def _make_buffer(self,
+                     *size: Union[int, torch.SymInt],
+                     dtype: torch.dtype,
+                     numpy: bool = True) -> CpuGpuBuffer:
+        # Bfloat16 torch tensors cannot be directly cast to a numpy array, so
+        # if a bfloat16 buffer is needed without a corresponding numpy array,
+        # don't bother instantiating the numpy array.
+        return CpuGpuBuffer(*size,
                             dtype=dtype,
                             device=self.device,
-                            pin_memory=self.pin_memory)
+                            pin_memory=self.pin_memory,
+                            with_numpy=numpy)
 
     def _init_model_kwargs(self, num_tokens: int):
         model_kwargs = dict[str, Any]()
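For context, a simplified sketch of the buffer helper these call sites imply (hypothetical, not the real CpuGpuBuffer implementation): a pinned CPU tensor, a device copy, and an optional numpy view that is skipped when with_numpy=False so bfloat16 buffers can be created safely.

    import torch

    class CpuGpuBufferSketch:  # illustrative only; the actual class lives in vLLM
        def __init__(self, *size, dtype, device, pin_memory, with_numpy=True):
            self.cpu = torch.zeros(*size, dtype=dtype, pin_memory=pin_memory)
            self.gpu = self.cpu.to(device)
            # bfloat16 tensors cannot be viewed as numpy arrays, so only build
            # the numpy view when the caller asks for one.
            self.np = self.cpu.numpy() if with_numpy else None

The call sites below then read the device-side tensor through the .gpu attribute instead of indexing the buffer object directly.
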
@@ -1645,11 +1655,11 @@ def execute_model(
             )
 
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_scheduled_tokens].copy_(
+            self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(
                 inputs_embeds_scheduled)
 
             input_ids = None
-            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+            inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
             model_kwargs = {
                 **self._init_model_kwargs(num_scheduled_tokens),
                 **self._extract_mm_kwargs(scheduler_output),
@@ -2484,7 +2494,7 @@ def _dummy_run(
                 num_scheduled_tokens, remove_lora):
             if self.supports_mm_inputs:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds[:num_tokens]
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
                 model_kwargs = {
                     **self._init_model_kwargs(num_tokens),
                     **self._dummy_mm_kwargs(num_reqs),