diff --git a/scripts/import_packages.py b/scripts/import_packages.py
index e96d65e1..9d9241b3 100644
--- a/scripts/import_packages.py
+++ b/scripts/import_packages.py
@@ -7,6 +7,7 @@
 import numpy as np
 import sqlite_vec_sl_tmp
 
+from codegate.config import Config
 from codegate.inference.inference_engine import LlamaCppInferenceEngine
 from codegate.utils.utils import generate_vector_string
 
@@ -55,7 +56,9 @@ def setup_schema(self):
 
     async def process_package(self, package):
         vector_str = generate_vector_string(package)
-        vector = await self.inference_engine.embed(self.model_path, [vector_str])
+        vector = await self.inference_engine.embed(
+            self.model_path, [vector_str], n_gpu_layers=Config.get_config().chat_model_n_gpu_layers
+        )
         vector_array = np.array(vector[0], dtype=np.float32)
 
         cursor = self.conn.cursor()
diff --git a/src/codegate/inference/inference_engine.py b/src/codegate/inference/inference_engine.py
index 9433a345..6e0a6e50 100644
--- a/src/codegate/inference/inference_engine.py
+++ b/src/codegate/inference/inference_engine.py
@@ -1,5 +1,13 @@
+from typing import Iterator, List, Union
+
 import structlog
-from llama_cpp import Llama
+from llama_cpp import (
+    CreateChatCompletionResponse,
+    CreateChatCompletionStreamResponse,
+    CreateCompletionResponse,
+    CreateCompletionStreamResponse,
+    Llama,
+)
 
 logger = structlog.get_logger("codegate")
 
@@ -35,7 +43,9 @@ def _close_models(self):
             model._sampler.close()
             model.close()
 
-    async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers=0) -> Llama:
+    async def __get_model(
+        self, model_path: str, embedding: bool = False, n_ctx: int = 512, n_gpu_layers: int = 0
+    ) -> Llama:
         """
         Returns Llama model object from __models if present. Otherwise, the model is loaded
         and added to __models and returned.
@@ -55,7 +65,9 @@ async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers
 
         return self.__models[model_path]
 
-    async def complete(self, model_path, n_ctx=512, n_gpu_layers=0, **completion_request):
+    async def complete(
+        self, model_path: str, n_ctx: int = 512, n_gpu_layers: int = 0, **completion_request
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """
         Generates a chat completion using the specified model and request parameters.
         """
@@ -64,7 +76,9 @@ async def complete(self, model_path, n_ctx=512, n_gpu_layers=0, **completion_req
         )
         return model.create_completion(**completion_request)
 
-    async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_request):
+    async def chat(
+        self, model_path: str, n_ctx: int = 512, n_gpu_layers: int = 0, **chat_completion_request
+    ) -> Union[CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]]:
         """
         Generates a chat completion using the specified model and request parameters.
         """
@@ -73,18 +87,20 @@ async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_re
         )
         return model.create_chat_completion(**chat_completion_request)
 
-    async def embed(self, model_path, content):
+    async def embed(self, model_path: str, content: List[str], n_gpu_layers=0) -> List[List[float]]:
         """
         Generates an embedding for the given content using the specified model.
""" logger.debug( "Generating embedding", model=model_path.split("/")[-1], - content=content, + content=content[0][0 : min(100, len(content[0]))], content_length=len(content[0]) if content else 0, ) - model = await self.__get_model(model_path=model_path, embedding=True) + model = await self.__get_model( + model_path=model_path, embedding=True, n_gpu_layers=n_gpu_layers + ) embedding = model.embed(content) logger.debug( diff --git a/src/codegate/pipeline/suspicious_commands/suspicious_commands.py b/src/codegate/pipeline/suspicious_commands/suspicious_commands.py index 7c61454a..1670e010 100644 --- a/src/codegate/pipeline/suspicious_commands/suspicious_commands.py +++ b/src/codegate/pipeline/suspicious_commands/suspicious_commands.py @@ -80,7 +80,9 @@ async def compute_embeddings(self, phrases): Returns: torch.Tensor: Tensor of embeddings. """ - embeddings = await self.inference_engine.embed(self.model_path, phrases) + embeddings = await self.inference_engine.embed( + self.model_path, phrases, n_gpu_layers=Config.get_config().chat_model_n_gpu_layers + ) return embeddings async def classify_phrase(self, phrase, embeddings=None): diff --git a/src/codegate/pipeline/suspicious_commands/suspicious_commands_trainer.py b/src/codegate/pipeline/suspicious_commands/suspicious_commands_trainer.py index d31e981d..5b8c71f0 100644 --- a/src/codegate/pipeline/suspicious_commands/suspicious_commands_trainer.py +++ b/src/codegate/pipeline/suspicious_commands/suspicious_commands_trainer.py @@ -107,7 +107,9 @@ async def train(self, phrases, labels): phrases (list of str): List of phrases to train on. labels (list of int): Corresponding labels for the phrases. """ - embeds = await self.inference_engine.embed(self.model_path, phrases) + embeds = await self.inference_engine.embed( + self.model_path, phrases, n_gpu_layers=Config.get_config().chat_model_n_gpu_layers + ) if isinstance(embeds[0], list): embedding_dim = len(embeds[0]) else: diff --git a/src/codegate/storage/storage_engine.py b/src/codegate/storage/storage_engine.py index 9543fe70..f02cf72b 100644 --- a/src/codegate/storage/storage_engine.py +++ b/src/codegate/storage/storage_engine.py @@ -185,7 +185,11 @@ async def search( elif query: # Generate embedding for the query - query_vector = await self.inference_engine.embed(self.model_path, [query]) + query_vector = await self.inference_engine.embed( + self.model_path, + [query], + n_gpu_layers=Config.get_config().chat_model_n_gpu_layers, + ) query_embedding = np.array(query_vector[0], dtype=np.float32) query_embedding_bytes = query_embedding.tobytes()