diff --git a/tensorrt_llm/_torch/peft/lora/layer.py b/tensorrt_llm/_torch/peft/lora/layer.py
index fb984614175..2c8bc5e2f58 100644
--- a/tensorrt_llm/_torch/peft/lora/layer.py
+++ b/tensorrt_llm/_torch/peft/lora/layer.py
@@ -107,8 +107,6 @@ def forward(
                 module_idx = int(module_idx)
                 if module_idx in lora_params[layer_idx]:
                     active_lora_module_ids.append(module_idx)
-                    # TODO (dafrimi): needs to pass this is_dora arg
-                    lora_params[layer_idx][module_idx]['is_dora']
                     lora_ranks.append(
                         lora_params[layer_idx][module_idx]['adapter_size'])
                     lora_weight_pointers.append(
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
index 8aa263bb039..67f63d8811b 100644
--- a/tensorrt_llm/_torch/pyexecutor/llm_request.py
+++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -317,7 +317,9 @@ def __init__(
         self.py_decoding_iter = 0
         self.is_attention_dp_dummy = False
         self.is_cuda_graph_dummy = False
-        self.py_lora_task_layer_module_configs = None
+        self.py_lora_task_layer_module_configs: list[
+            tensorrt_llm.bindings.internal.runtime.
+            TaskLayerModuleConfig] | None = None
 
         self.py_return_log_probs = return_log_probs
         self.py_return_context_logits = return_context_logits
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 2bbd97a3821..fe5d920d204 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1971,7 +1971,6 @@ def _get_lora_params_from_requests(self,
                 module_id: dict
                 {
                     adapter_size: torch tensor: int
-                    is_dora: torch tensor: bool
                     weight_pointers: torch tensor: int64
                 }
             }
@@ -1990,88 +1989,63 @@ def _get_lora_params_from_requests(self,
             for module in request.py_lora_task_layer_module_configs:
                 module_id = module.module_id
                 layer_id = module.layer_id
-                adapter_size = module.adapter_size
-                is_dora = module.scaling_vec_pointer == 0
-                weights_in_pointer = module.weights_in_pointer
-                weights_out_pointer = module.weights_out_pointer
-                scaling_vec_pointer = module.scaling_vec_pointer
-                if weights_in_pointer is None:
-                    weights_in_pointer = 0
-                if weights_out_pointer is None:
-                    weights_out_pointer = 0
-                if scaling_vec_pointer is None:
-                    scaling_vec_pointer = 0
                 if layer_id not in lora_params:
                     lora_params[layer_id] = {}
                 if module_id not in lora_params[layer_id]:
-                    lora_params[layer_id][module_id] = {}
-
-                if 'adapter_size' not in lora_params[layer_id][module_id]:
-                    lora_params[layer_id][module_id]['adapter_size'] = []
-                if 'is_dora' not in lora_params[layer_id][module_id]:
-                    lora_params[layer_id][module_id]['is_dora'] = []
-                if 'weight_pointers' not in lora_params[layer_id][module_id]:
-                    lora_params[layer_id][module_id]['weight_pointers'] = []
-
-                tmp_lora_params[
-                    f'{request.py_request_id}_{layer_id}_{module_id}_adapter_size'] = [
-                        adapter_size
-                    ]
-                tmp_lora_params[
-                    f'{request.py_request_id}_{layer_id}_{module_id}_is_dora'] = [
-                        is_dora
-                    ]
-                tmp_lora_params[
-                    f'{request.py_request_id}_{layer_id}_{module_id}_weights_pointer'] = [
-                        weights_in_pointer, weights_out_pointer,
-                        scaling_vec_pointer
-                    ]
+                    lora_params[layer_id][module_id] = {
+                        'adapter_size': [],
+                        'weight_pointers': [],
+                    }
+
+                scaling_vec_pointer = module.scaling_vec_pointer
+                if scaling_vec_pointer is None:
+                    scaling_vec_pointer = 0
+                tmp_lora_params[(request.py_request_id, layer_id,
+                                 module_id)] = {
+                                     'adapter_size': [module.adapter_size],
+                                     'weight_pointers': [
+                                         module.weights_in_pointer,
+                                         module.weights_out_pointer,
+                                         scaling_vec_pointer
+                                     ],
+                                 }
 
         for request in request_list:
             # Need to set default values for this case
             if request.py_lora_task_layer_module_configs is None:
                 for layer_id in lora_params:
                     for module_id in lora_params[layer_id]:
-                        lora_params[layer_id][module_id]['adapter_size'].append(
-                            0)
-                        lora_params[layer_id][module_id]['is_dora'].append(
-                            False)
-                        lora_params[layer_id][module_id]['weight_pointers'] += [
-                            0, 0, 0
-                        ]
+                        current_lora_params = lora_params[layer_id][module_id]
+                        current_lora_params['adapter_size'].append(0)
+                        current_lora_params['weight_pointers'] += [0, 0, 0]
             else:
                 for layer_id in lora_params:
                     for module_id in lora_params[layer_id]:
-                        if f'{request.py_request_id}_{layer_id}_{module_id}_adapter_size' not in tmp_lora_params:
-                            lora_params[layer_id][module_id][
-                                'adapter_size'].append(0)
-                            lora_params[layer_id][module_id]['is_dora'].append(
-                                False)
-                            lora_params[layer_id][module_id][
-                                'weight_pointers'] += [0, 0, 0]
+                        current_tmp_lora_params = tmp_lora_params.get(
+                            (request.py_request_id, layer_id, module_id), None)
+                        current_lora_params = lora_params[layer_id][module_id]
+                        if current_tmp_lora_params is None:
+                            current_lora_params['adapter_size'].append(0)
+                            current_lora_params['weight_pointers'] += [0, 0, 0]
                         else:
-                            lora_params[layer_id][module_id][
-                                'adapter_size'] += tmp_lora_params[
-                                    f'{request.py_request_id}_{layer_id}_{module_id}_adapter_size']
-                            lora_params[layer_id][module_id][
-                                'is_dora'] += tmp_lora_params[
-                                    f'{request.py_request_id}_{layer_id}_{module_id}_is_dora']
-                            lora_params[layer_id][module_id][
-                                'weight_pointers'] += tmp_lora_params[
-                                    f'{request.py_request_id}_{layer_id}_{module_id}_weights_pointer']
+                            current_lora_params[
+                                'adapter_size'] += current_tmp_lora_params[
+                                    'adapter_size']
+                            current_lora_params[
+                                'weight_pointers'] += current_tmp_lora_params[
+                                    'weight_pointers']
 
         for layer_id in lora_params:
             for module_id in lora_params[layer_id]:
-                lora_params[layer_id][module_id][
-                    'adapter_size'] = torch.IntTensor(
-                        lora_params[layer_id][module_id]['adapter_size'])
-                lora_params[layer_id][module_id][
-                    'weight_pointers'] = torch.LongTensor(
-                        lora_params[layer_id][module_id]['weight_pointers'])
+                current_lora_params = lora_params[layer_id][module_id]
+                current_lora_params['adapter_size'] = torch.IntTensor(
+                    current_lora_params['adapter_size'])
+                current_lora_params['weight_pointers'] = torch.LongTensor(
+                    current_lora_params['weight_pointers'])
 
-        if bool(lora_params):
+        if lora_params:
             lora_params['host_request_types'] = attn_metadata.host_request_types
             lora_params['prompt_lens_cpu'] = attn_metadata.prompt_lens_cpu
             lora_params['num_seqs'] = attn_metadata.num_seqs
diff --git a/tests/unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py b/tests/unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py
index 3998127a03f..180bd2842ca 100644
--- a/tests/unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py
+++ b/tests/unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py
@@ -419,31 +419,23 @@ def test_lora_attention(self):
                     lora_params['lora_ranks'],
                     'weight_pointers':
                     lora_params['lora_weights_pointers'],
-                    'is_dora':
-                    False,
                 },
                 LoraModuleType.ATTENTION_K: {
                     'adapter_size':
                     lora_params['lora_ranks'],
                     'weight_pointers':
                     lora_params['lora_weights_pointers'],
-                    'is_dora':
-                    False,
                 },
                 LoraModuleType.ATTENTION_V: {
                     'adapter_size':
                     lora_params['lora_ranks'],
                     'weight_pointers':
                     lora_params['lora_weights_pointers'],
-                    'is_dora':
-                    False,
                 },
                 LoraModuleType.ATTENTION_DENSE: {
                     'adapter_size':
                     lora_params['lora_ranks'],
                     'weight_pointers':
                     lora_params['lora_weights_pointers'],
-                    'is_dora':
-                    False,
                 }
             }
         }
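
Note for reviewers: below is a minimal, illustrative sketch of the lora_params structure that _get_lora_params_from_requests returns after this change, matching the updated docstring. The layer/module ids, adapter sizes, and pointer values are made-up placeholders, not values produced by the code; the None entries stand in for the attn_metadata fields attached at the top level.

# Illustrative sketch only; ids and pointer values are placeholders.
import torch

lora_params = {
    0: {  # layer_id
        3: {  # module_id
            # one entry per scheduled request; 0 means "no adapter" for that request
            'adapter_size': torch.IntTensor([8, 0]),
            # flat [weights_in, weights_out, scaling_vec] triplet per request;
            # a missing scaling_vec_pointer is stored as 0
            'weight_pointers': torch.LongTensor([1000, 2000, 0, 0, 0, 0]),
        },
    },
    # added at the top level only when lora_params is non-empty
    'host_request_types': None,  # attn_metadata.host_request_types
    'prompt_lens_cpu': None,     # attn_metadata.prompt_lens_cpu
    'num_seqs': None,            # attn_metadata.num_seqs
}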