From 8452099a5b1f167526cb26416a01fd7613f6b049 Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Mon, 24 Mar 2025 16:28:14 +0000 Subject: [PATCH 1/3] Vectors: Built content vector indexing system --- app/Config/services.php | 10 +++ app/Search/SearchIndex.php | 14 +++- app/Search/Vectors/EntityVectorGenerator.php | 84 +++++++++++++++++++ app/Search/Vectors/SearchVector.php | 16 ++++ .../Services/OpenAiVectorQueryService.php | 36 ++++++++ .../Vectors/Services/VectorQueryService.php | 12 +++ app/Search/Vectors/StoreEntityVectorsJob.php | 28 +++++++ .../Vectors/VectorQueryServiceProvider.php | 38 +++++++++ ..._24_155748_create_search_vectors_table.php | 32 +++++++ 9 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 app/Search/Vectors/EntityVectorGenerator.php create mode 100644 app/Search/Vectors/SearchVector.php create mode 100644 app/Search/Vectors/Services/OpenAiVectorQueryService.php create mode 100644 app/Search/Vectors/Services/VectorQueryService.php create mode 100644 app/Search/Vectors/StoreEntityVectorsJob.php create mode 100644 app/Search/Vectors/VectorQueryServiceProvider.php create mode 100644 database/migrations/2025_03_24_155748_create_search_vectors_table.php diff --git a/app/Config/services.php b/app/Config/services.php index d7345823150..a34b243f07d 100644 --- a/app/Config/services.php +++ b/app/Config/services.php @@ -22,6 +22,16 @@ // Callback URL for social authentication methods 'callback_url' => env('APP_URL', false), + // LLM Service + // Options: openai + 'llm' => env('LLM_SERVICE', ''), + + // OpenAI API-compatible service details + 'openai' => [ + 'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'), + 'key' => env('OPENAI_KEY', ''), + ], + 'github' => [ 'client_id' => env('GITHUB_APP_ID', false), 'client_secret' => env('GITHUB_APP_SECRET', false), diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php index 36f71f6ccc7..9b34fa04e28 100644 --- a/app/Search/SearchIndex.php +++ b/app/Search/SearchIndex.php @@ -6,6 +6,8 @@ use BookStack\Entities\EntityProvider; use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Page; +use BookStack\Search\Vectors\StoreEntityVectorsJob; +use BookStack\Search\Vectors\VectorQueryServiceProvider; use BookStack\Util\HtmlDocument; use DOMNode; use Illuminate\Database\Eloquent\Builder; @@ -25,7 +27,7 @@ class SearchIndex public static string $softDelimiters = ".-"; public function __construct( - protected EntityProvider $entityProvider + protected EntityProvider $entityProvider, ) { } @@ -37,6 +39,10 @@ public function indexEntity(Entity $entity): void $this->deleteEntityTerms($entity); $terms = $this->entityToTermDataArray($entity); $this->insertTerms($terms); + + if (VectorQueryServiceProvider::isEnabled()) { + dispatch(new StoreEntityVectorsJob($entity)); + } } /** @@ -47,9 +53,15 @@ public function indexEntity(Entity $entity): void public function indexEntities(array $entities): void { $terms = []; + $vectorQueryEnabled = VectorQueryServiceProvider::isEnabled(); + foreach ($entities as $entity) { $entityTerms = $this->entityToTermDataArray($entity); array_push($terms, ...$entityTerms); + + if ($vectorQueryEnabled) { + dispatch(new StoreEntityVectorsJob($entity)); + } } $this->insertTerms($terms); diff --git a/app/Search/Vectors/EntityVectorGenerator.php b/app/Search/Vectors/EntityVectorGenerator.php new file mode 100644 index 00000000000..8a49187736b --- /dev/null +++ b/app/Search/Vectors/EntityVectorGenerator.php @@ -0,0 +1,84 @@ +vectorQueryServiceProvider->get(); + + $text = $this->entityToPlainText($entity); + $chunks = $this->chunkText($text); + $embeddings = $this->chunksToEmbeddings($chunks, $vectorService); + + $this->deleteExistingEmbeddingsForEntity($entity); + $this->storeEmbeddings($embeddings, $chunks, $entity); + } + + protected function deleteExistingEmbeddingsForEntity(Entity $entity): void + { + SearchVector::query() + ->where('entity_type', '=', $entity->getMorphClass()) + ->where('entity_id', '=', $entity->id) + ->delete(); + } + + protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $entity): void + { + $toInsert = []; + + foreach ($embeddings as $index => $embedding) { + $text = $textChunks[$index]; + $toInsert[] = [ + 'entity_id' => $entity->id, + 'entity_type' => $entity->getMorphClass(), + 'embedding' => DB::raw('STRING_TO_VECTOR("[' . implode(',', $embedding) . ']")'), + 'text' => $text, + ]; + } + + // TODO - Chunk inserts + SearchVector::query()->insert($toInsert); + } + + /** + * @param string[] $chunks + * @return float[] array + */ + protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQueryService): array + { + $embeddings = []; + foreach ($chunks as $index => $chunk) { + $embeddings[$index] = $vectorQueryService->generateEmbeddings($chunk); + } + return $embeddings; + } + + /** + * @return string[] + */ + protected function chunkText(string $text): array + { + // TODO - Join adjacent smaller chunks up + return array_filter(array_map(function (string $section): string { + return trim($section); + }, explode("\n", $text))); + } + + protected function entityToPlainText(Entity $entity): string + { + $text = $entity->name . "\n\n" . $entity->{$entity->textField}; + // TODO - Add tags + return $text; + } +} diff --git a/app/Search/Vectors/SearchVector.php b/app/Search/Vectors/SearchVector.php new file mode 100644 index 00000000000..4a5555f87d9 --- /dev/null +++ b/app/Search/Vectors/SearchVector.php @@ -0,0 +1,16 @@ +endpoint, '/') . '/' . ltrim($uri, '/'); + $client = $this->http->buildClient(10); + $request = $this->http->jsonRequest($method, $fullUrl, $data) + ->withHeader('Authorization', 'Bearer ' . $this->key); + + $response = $client->sendRequest($request); + return json_decode($response->getBody()->getContents(), true); + } + + public function generateEmbeddings(string $text): array + { + $response = $this->jsonRequest('POST', 'v1/embeddings', [ + 'input' => $text, + 'model' => 'text-embedding-3-small', + ]); + + return $response['data'][0]['embedding']; + } +} diff --git a/app/Search/Vectors/Services/VectorQueryService.php b/app/Search/Vectors/Services/VectorQueryService.php new file mode 100644 index 00000000000..2cc4ed0178f --- /dev/null +++ b/app/Search/Vectors/Services/VectorQueryService.php @@ -0,0 +1,12 @@ +generateAndStore($this->entity); + } +} diff --git a/app/Search/Vectors/VectorQueryServiceProvider.php b/app/Search/Vectors/VectorQueryServiceProvider.php new file mode 100644 index 00000000000..c700307e1f3 --- /dev/null +++ b/app/Search/Vectors/VectorQueryServiceProvider.php @@ -0,0 +1,38 @@ +getServiceName(); + + if ($service === 'openai') { + $key = config('services.openai.key'); + $endpoint = config('services.openai.endpoint'); + return new OpenAiVectorQueryService($endpoint, $key, $this->http); + } + + throw new \Exception("No '{$service}' LLM service found"); + } + + protected static function getServiceName(): string + { + return strtolower(config('services.llm')); + } + + public static function isEnabled(): bool + { + return !empty(static::getServiceName()); + } +} diff --git a/database/migrations/2025_03_24_155748_create_search_vectors_table.php b/database/migrations/2025_03_24_155748_create_search_vectors_table.php new file mode 100644 index 00000000000..d7fb0118a2f --- /dev/null +++ b/database/migrations/2025_03_24_155748_create_search_vectors_table.php @@ -0,0 +1,32 @@ +string('entity_type', 100); + $table->integer('entity_id'); + $table->text('text'); + $table->vector('embedding'); + + $table->index(['entity_type', 'entity_id']); + }); + } + + /** + * Reverse the migrations. + */ + public function down(): void + { + Schema::dropIfExists('search_vectors'); + } +}; From 0ffcb3d4aa895c1a3bfa8f9a14338e901b3de161 Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Mon, 24 Mar 2025 19:51:48 +0000 Subject: [PATCH 2/3] Vectors: Got basic LLM querying working using vector search context --- app/Search/SearchController.php | 16 +++++++++ app/Search/Vectors/EntityVectorGenerator.php | 2 +- .../Services/OpenAiVectorQueryService.php | 21 ++++++++++++ .../Vectors/Services/VectorQueryService.php | 9 +++++ app/Search/Vectors/VectorSearchRunner.php | 33 +++++++++++++++++++ ..._24_155748_create_search_vectors_table.php | 5 ++- resources/views/search/query.blade.php | 29 ++++++++++++++++ routes/web.php | 1 + 8 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 app/Search/Vectors/VectorSearchRunner.php create mode 100644 resources/views/search/query.blade.php diff --git a/app/Search/SearchController.php b/app/Search/SearchController.php index 2fce6a3d53f..a688385e7c3 100644 --- a/app/Search/SearchController.php +++ b/app/Search/SearchController.php @@ -6,6 +6,7 @@ use BookStack\Entities\Queries\QueryPopular; use BookStack\Entities\Tools\SiblingFetcher; use BookStack\Http\Controller; +use BookStack\Search\Vectors\VectorSearchRunner; use Illuminate\Http\Request; class SearchController extends Controller @@ -139,4 +140,19 @@ public function searchSiblings(Request $request, SiblingFetcher $siblingFetcher) return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']); } + + public function searchQuery(Request $request, VectorSearchRunner $runner) + { + $query = $request->get('query', ''); + + if ($query) { + $results = $runner->run($query); + } else { + $results = null; + } + + return view('search.query', [ + 'results' => $results, + ]); + } } diff --git a/app/Search/Vectors/EntityVectorGenerator.php b/app/Search/Vectors/EntityVectorGenerator.php index 8a49187736b..9563694a321 100644 --- a/app/Search/Vectors/EntityVectorGenerator.php +++ b/app/Search/Vectors/EntityVectorGenerator.php @@ -42,7 +42,7 @@ protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $toInsert[] = [ 'entity_id' => $entity->id, 'entity_type' => $entity->getMorphClass(), - 'embedding' => DB::raw('STRING_TO_VECTOR("[' . implode(',', $embedding) . ']")'), + 'embedding' => DB::raw('VEC_FROMTEXT("[' . implode(',', $embedding) . ']")'), 'text' => $text, ]; } diff --git a/app/Search/Vectors/Services/OpenAiVectorQueryService.php b/app/Search/Vectors/Services/OpenAiVectorQueryService.php index 8d291099846..e0e145f3ad7 100644 --- a/app/Search/Vectors/Services/OpenAiVectorQueryService.php +++ b/app/Search/Vectors/Services/OpenAiVectorQueryService.php @@ -33,4 +33,25 @@ public function generateEmbeddings(string $text): array return $response['data'][0]['embedding']; } + + public function query(string $input, array $context): string + { + $formattedContext = implode("\n", $context); + + $response = $this->jsonRequest('POST', 'v1/chat/completions', [ + 'model' => 'gpt-4o', + 'messages' => [ + [ + 'role' => 'developer', + 'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response.' + ], + [ + 'role' => 'user', + 'content' => "Provide a response to the below given QUERY using the below given CONTEXT\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}", + ] + ], + ]); + + return $response['choices'][0]['message']['content'] ?? ''; + } } diff --git a/app/Search/Vectors/Services/VectorQueryService.php b/app/Search/Vectors/Services/VectorQueryService.php index 2cc4ed0178f..746f95f5b22 100644 --- a/app/Search/Vectors/Services/VectorQueryService.php +++ b/app/Search/Vectors/Services/VectorQueryService.php @@ -9,4 +9,13 @@ interface VectorQueryService * @return float[] */ public function generateEmbeddings(string $text): array; + + /** + * Query the LLM service using the given user input, and + * relevant context text retrieved locally via a vector search. + * Returns the response output text from the LLM. + * + * @param string[] $context + */ + public function query(string $input, array $context): string; } diff --git a/app/Search/Vectors/VectorSearchRunner.php b/app/Search/Vectors/VectorSearchRunner.php new file mode 100644 index 00000000000..db28779e403 --- /dev/null +++ b/app/Search/Vectors/VectorSearchRunner.php @@ -0,0 +1,33 @@ +vectorQueryServiceProvider->get(); + $queryVector = $queryService->generateEmbeddings($query); + + // TODO - Apply permissions + // TODO - Join models + $topMatches = SearchVector::query()->select('text', 'entity_type', 'entity_id') + ->selectRaw('VEC_DISTANCE_COSINE(VEC_FROMTEXT("[' . implode(',', $queryVector) . ']"), embedding) as distance') + ->orderBy('distance', 'asc') + ->limit(10) + ->get(); + + $matchesText = array_values(array_map(fn (SearchVector $match) => $match->text, $topMatches->all())); + $llmResult = $queryService->query($query, $matchesText); + + return [ + 'llm_result' => $llmResult, + 'entity_matches' => $topMatches->toArray() + ]; + } +} diff --git a/database/migrations/2025_03_24_155748_create_search_vectors_table.php b/database/migrations/2025_03_24_155748_create_search_vectors_table.php index d7fb0118a2f..1b552b22c9a 100644 --- a/database/migrations/2025_03_24_155748_create_search_vectors_table.php +++ b/database/migrations/2025_03_24_155748_create_search_vectors_table.php @@ -16,10 +16,13 @@ public function up(): void $table->string('entity_type', 100); $table->integer('entity_id'); $table->text('text'); - $table->vector('embedding'); $table->index(['entity_type', 'entity_id']); }); + + $table = DB::getTablePrefix() . 'search_vectors'; + DB::statement("ALTER TABLE {$table} ADD COLUMN (embedding VECTOR(1536) NOT NULL)"); + DB::statement("ALTER TABLE {$table} ADD VECTOR INDEX (embedding) DISTANCE=cosine"); } /** diff --git a/resources/views/search/query.blade.php b/resources/views/search/query.blade.php new file mode 100644 index 00000000000..e8b4c84779c --- /dev/null +++ b/resources/views/search/query.blade.php @@ -0,0 +1,29 @@ +@extends('layouts.simple') + +@section('body') +
+ +
+ + +
+ + @if($results) +

Results

+ +

LLM Output

+

{{ $results['llm_result'] }}

+ +

Entity Matches

+ @foreach($results['entity_matches'] as $match) +
+
{{ $match['entity_type'] }}:{{ $match['entity_id'] }}; Distance: {{ $match['distance'] }}
+
+ match text +
{{ $match['text'] }}
+
+
+ @endforeach + @endif +
+@stop diff --git a/routes/web.php b/routes/web.php index 8184725834c..15fe6d69b2d 100644 --- a/routes/web.php +++ b/routes/web.php @@ -187,6 +187,7 @@ // Search Route::get('/search', [SearchController::class, 'search']); + Route::get('/search/query', [SearchController::class, 'searchQuery']); Route::get('/search/book/{bookId}', [SearchController::class, 'searchBook']); Route::get('/search/chapter/{bookId}', [SearchController::class, 'searchChapter']); Route::get('/search/entity/siblings', [SearchController::class, 'searchSiblings']); From a023bed41d0219d08d7dbce52948e3e5c3528381 Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Tue, 25 Mar 2025 19:38:32 +0000 Subject: [PATCH 3/3] Vectors: Added command to regenerate for all Also made models configurable. Tested system scales via 86k vector entries. --- app/Config/services.php | 2 + .../Commands/RegenerateVectorsCommand.php | 46 +++++++++++++++++++ .../Services/OpenAiVectorQueryService.php | 23 +++++++--- .../Vectors/VectorQueryServiceProvider.php | 4 +- app/Search/Vectors/VectorSearchRunner.php | 1 + ..._24_155748_create_search_vectors_table.php | 2 + 6 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 app/Console/Commands/RegenerateVectorsCommand.php diff --git a/app/Config/services.php b/app/Config/services.php index a34b243f07d..aafe0bacc99 100644 --- a/app/Config/services.php +++ b/app/Config/services.php @@ -30,6 +30,8 @@ 'openai' => [ 'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'), 'key' => env('OPENAI_KEY', ''), + 'embedding_model' => env('OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'), + 'query_model' => env('OPENAI_QUERY_MODEL', 'gpt-4o'), ], 'github' => [ diff --git a/app/Console/Commands/RegenerateVectorsCommand.php b/app/Console/Commands/RegenerateVectorsCommand.php new file mode 100644 index 00000000000..700d05300d8 --- /dev/null +++ b/app/Console/Commands/RegenerateVectorsCommand.php @@ -0,0 +1,46 @@ +delete(); + + $types = $entityProvider->all(); + foreach ($types as $type => $typeInstance) { + $this->info("Creating jobs to store vectors for {$type} data..."); + /** @var Entity[] $entities */ + $typeInstance->newQuery()->chunkById(100, function ($entities) { + foreach ($entities as $entity) { + dispatch(new StoreEntityVectorsJob($entity)); + } + }); + } + } +} diff --git a/app/Search/Vectors/Services/OpenAiVectorQueryService.php b/app/Search/Vectors/Services/OpenAiVectorQueryService.php index e0e145f3ad7..fea4d5c1445 100644 --- a/app/Search/Vectors/Services/OpenAiVectorQueryService.php +++ b/app/Search/Vectors/Services/OpenAiVectorQueryService.php @@ -6,17 +6,26 @@ class OpenAiVectorQueryService implements VectorQueryService { + protected string $key; + protected string $endpoint; + protected string $embeddingModel; + protected string $queryModel; + public function __construct( - protected string $endpoint, - protected string $key, + protected array $options, protected HttpRequestService $http, ) { + // TODO - Some kind of validation of options + $this->key = $this->options['key'] ?? ''; + $this->endpoint = $this->options['endpoint'] ?? ''; + $this->embeddingModel = $this->options['embedding_model'] ?? ''; + $this->queryModel = $this->options['query_model'] ?? ''; } protected function jsonRequest(string $method, string $uri, array $data): array { $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/'); - $client = $this->http->buildClient(10); + $client = $this->http->buildClient(30); $request = $this->http->jsonRequest($method, $fullUrl, $data) ->withHeader('Authorization', 'Bearer ' . $this->key); @@ -28,7 +37,7 @@ public function generateEmbeddings(string $text): array { $response = $this->jsonRequest('POST', 'v1/embeddings', [ 'input' => $text, - 'model' => 'text-embedding-3-small', + 'model' => $this->embeddingModel, ]); return $response['data'][0]['embedding']; @@ -39,15 +48,15 @@ public function query(string $input, array $context): string $formattedContext = implode("\n", $context); $response = $this->jsonRequest('POST', 'v1/chat/completions', [ - 'model' => 'gpt-4o', + 'model' => $this->queryModel, 'messages' => [ [ 'role' => 'developer', - 'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response.' + 'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response. Don\'t try to converse or continue the conversation.' ], [ 'role' => 'user', - 'content' => "Provide a response to the below given QUERY using the below given CONTEXT\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}", + 'content' => "Provide a response to the below given QUERY using the below given CONTEXT. The CONTEXT is split into parts via lines. Ignore any nonsensical lines of CONTEXT.\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}", ] ], ]); diff --git a/app/Search/Vectors/VectorQueryServiceProvider.php b/app/Search/Vectors/VectorQueryServiceProvider.php index c700307e1f3..eae7149d03c 100644 --- a/app/Search/Vectors/VectorQueryServiceProvider.php +++ b/app/Search/Vectors/VectorQueryServiceProvider.php @@ -18,9 +18,7 @@ public function get(): VectorQueryService $service = $this->getServiceName(); if ($service === 'openai') { - $key = config('services.openai.key'); - $endpoint = config('services.openai.endpoint'); - return new OpenAiVectorQueryService($endpoint, $key, $this->http); + return new OpenAiVectorQueryService(config('services.openai'), $this->http); } throw new \Exception("No '{$service}' LLM service found"); diff --git a/app/Search/Vectors/VectorSearchRunner.php b/app/Search/Vectors/VectorSearchRunner.php index db28779e403..53b1a4bd696 100644 --- a/app/Search/Vectors/VectorSearchRunner.php +++ b/app/Search/Vectors/VectorSearchRunner.php @@ -19,6 +19,7 @@ public function run(string $query): array $topMatches = SearchVector::query()->select('text', 'entity_type', 'entity_id') ->selectRaw('VEC_DISTANCE_COSINE(VEC_FROMTEXT("[' . implode(',', $queryVector) . ']"), embedding) as distance') ->orderBy('distance', 'asc') + ->having('distance', '<', 0.6) ->limit(10) ->get(); diff --git a/database/migrations/2025_03_24_155748_create_search_vectors_table.php b/database/migrations/2025_03_24_155748_create_search_vectors_table.php index 1b552b22c9a..0ae67c2256f 100644 --- a/database/migrations/2025_03_24_155748_create_search_vectors_table.php +++ b/database/migrations/2025_03_24_155748_create_search_vectors_table.php @@ -21,6 +21,8 @@ public function up(): void }); $table = DB::getTablePrefix() . 'search_vectors'; + + // TODO - Vector size might need to be dynamic DB::statement("ALTER TABLE {$table} ADD COLUMN (embedding VECTOR(1536) NOT NULL)"); DB::statement("ALTER TABLE {$table} ADD VECTOR INDEX (embedding) DISTANCE=cosine"); }