Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
54f0a7d
Quick fix to limit source_id ballooning while inserting nodes
divineslight Oct 14, 2025
d52c337
Import from env and use default if none and removed useless import
haseebuchiha Oct 14, 2025
c06522b
Get max source Id config from .env and lightRAG init
divineslight Oct 15, 2025
9f49e56
Merge branch 'main' into feat-entity-size-caps
danielaskdd Oct 17, 2025
7871600
Quick fix to limit source_id ballooning while inserting nodes
divineslight Oct 14, 2025
4e740af
Import from env and use default if none and removed useless import
haseebuchiha Oct 14, 2025
17c2a92
Get max source Id config from .env and lightRAG init
divineslight Oct 15, 2025
6b37d3c
Merge branch 'feat-entity-size-caps' into limit-vdb-metadata-size
danielaskdd Oct 17, 2025
03333d6
Merge branch 'main' into limit-vdb-metadata-size
danielaskdd Oct 17, 2025
bdadaa6
Merge branch 'main' into limit-vdb-metadata-size
danielaskdd Oct 18, 2025
dc62c78
Add entity/relation chunk tracking with configurable source ID limits
danielaskdd Oct 20, 2025
a9fec26
Add file path limit configuration for entities and relations
danielaskdd Oct 20, 2025
e0fd31a
Fix logging message formatting
danielaskdd Oct 20, 2025
637b850
Add truncation indicator and update property labels in graph view
danielaskdd Oct 20, 2025
e01c998
Track placeholders in file paths for accurate source count display
danielaskdd Oct 20, 2025
665f60b
Refactor entity/relation merge to consolidate VDB operations within f…
danielaskdd Oct 20, 2025
1154c56
Refactor deduplication calculation and remove unused variables
danielaskdd Oct 20, 2025
019dff5
Update truncation message format in properties tooltip
danielaskdd Oct 20, 2025
cd1c48b
Standardize placeholder format to use colon separator consistently
danielaskdd Oct 20, 2025
1248b3a
Increase default limits for source IDs and file paths in metadata
danielaskdd Oct 20, 2025
a525324
Simplify skip logging and reduce pipeline status updates
danielaskdd Oct 20, 2025
be3d274
Refactor node and edge merging logic with improved code structure
danielaskdd Oct 21, 2025
80668aa
Improve file path truncation labels and UI consistency
danielaskdd Oct 21, 2025
3ad616b
Change default source IDs limit method from KEEP to FIFO
danielaskdd Oct 21, 2025
3ed2abd
Improve logging to show source ID ratios when skipping entities/edges
danielaskdd Oct 21, 2025
e5e16b7
Fix Redis data migration error
danielaskdd Oct 21, 2025
88a4552
Increase default max file paths from 30 to 100 and improve documentation
danielaskdd Oct 21, 2025
fe890fc
Improve formatting of limit method info in rebuild functions
danielaskdd Oct 21, 2025
a809245
Preserve file path order by using lists instead of sets
danielaskdd Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,6 @@ ENABLE_LLM_CACHE=true
### control the maximum tokens send to LLM (include entities, relations and chunks)
# MAX_TOTAL_TOKENS=30000

### maximum number of related chunks per source entity or relation
### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
### Higher values increase re-ranking time
# RELATED_CHUNK_NUMBER=5

### chunk selection strategies
### VECTOR: Pick KG chunks by vector similarity, delivered chunks to the LLM aligning more closely with naive retrieval
### WEIGHT: Pick KG chunks by entity and chunk weight, delivered more solely KG related chunks to the LLM
Expand Down Expand Up @@ -140,6 +135,22 @@ SUMMARY_LANGUAGE=English
### Maximum context size sent to LLM for description summary
# SUMMARY_CONTEXT_SIZE=12000

### control the maximum chunk_ids stored in vector and graph db
# MAX_SOURCE_IDS_PER_ENTITY=300
# MAX_SOURCE_IDS_PER_RELATION=300
### control chunk_ids limitation method: FIFO, KEEP
### FIFO: First in first out
### KEEP: Keep oldest (fewer merge actions and faster)
# SOURCE_IDS_LIMIT_METHOD=FIFO

# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
# MAX_FILE_PATHS=100

### maximum number of related chunks per source entity or relation
### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
### Higher values increase re-ranking time
# RELATED_CHUNK_NUMBER=5

###############################
### Concurrency Configuration
###############################
Expand Down
2 changes: 1 addition & 1 deletion lightrag/api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__api_version__ = "0241"
__api_version__ = "0242"
2 changes: 2 additions & 0 deletions lightrag/api/routers/document_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,8 @@ async def clear_documents():
rag.full_docs,
rag.full_entities,
rag.full_relations,
rag.entity_chunks,
rag.relation_chunks,
rag.entities_vdb,
rag.relationships_vdb,
rag.chunks_vdb,
Expand Down
8 changes: 8 additions & 0 deletions lightrag/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,14 @@ async def delete(self, ids: list[str]) -> None:
None
"""

@abstractmethod
async def is_empty(self) -> bool:
    """Check if the storage is empty.

    Concrete storage backends must implement this; it is used to decide
    whether data migration/initialization work is needed.

    Returns:
        bool: True if storage contains no data, False otherwise
    """


@dataclass
class BaseGraphStorage(StorageNameSpace, ABC):
Expand Down
21 changes: 20 additions & 1 deletion lightrag/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,27 @@
DEFAULT_MIN_RERANK_SCORE = 0.0
DEFAULT_RERANK_BINDING = "null"

# File path configuration for vector and graph database(Should not be changed, used in Milvus Schema)
# Default source ids limit in meta data for entity and relation
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
### control chunk_ids limitation method: FIFO, KEEP
### FIFO: First in first out
### KEEP: Keep oldest (fewer merge actions and faster)
SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {
SOURCE_IDS_LIMIT_METHOD_KEEP,
SOURCE_IDS_LIMIT_METHOD_FIFO,
}
# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
DEFAULT_MAX_FILE_PATHS = 100

# Field length of file_path in Milvus Schema for entity and relation (Should not be changed)
# file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata.
DEFAULT_MAX_FILE_PATH_LENGTH = 32768
# Placeholder for more file paths in meta data for entity and relation (Should not be changed)
DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated"

# Default temperature for LLM
DEFAULT_TEMPERATURE = 1.0
Expand Down
14 changes: 14 additions & 0 deletions lightrag/kg/json_doc_status_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,20 @@ async def upsert(self, data: dict[str, dict[str, Any]]) -> None:

await self.index_done_callback()

async def is_empty(self) -> bool:
    """Report whether this doc-status storage currently holds no records.

    Returns:
        bool: True if storage is empty, False otherwise

    Raises:
        StorageNotInitializedError: If storage is not initialized
    """
    lock = self._storage_lock
    if lock is None:
        # initialize_storage() has not run yet; fail loudly rather than guess
        raise StorageNotInitializedError("JsonDocStatusStorage")
    async with lock:
        return not self._data

async def get_by_id(self, id: str) -> Union[dict[str, Any], None]:
async with self._storage_lock:
return self._data.get(id)
Expand Down
29 changes: 9 additions & 20 deletions lightrag/kg/json_kv_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,26 +84,6 @@ async def index_done_callback(self) -> None:
write_json(data_dict, self._file_name)
await clear_all_update_flags(self.final_namespace)

async def get_all(self) -> dict[str, Any]:
    """Return a snapshot of every record currently in storage.

    Returns:
        Dictionary containing all stored data. Each truthy record is
        shallow-copied and guaranteed to carry ``create_time`` and
        ``update_time`` keys (defaulting to 0 for legacy records);
        falsy records are passed through untouched.
    """
    async with self._storage_lock:
        snapshot: dict[str, Any] = {}
        for record_id, record in self._data.items():
            if not record:
                # Keep empty/None records exactly as stored
                snapshot[record_id] = record
                continue
            # Copy so callers cannot mutate the live store
            copied = dict(record)
            copied.setdefault("create_time", 0)
            copied.setdefault("update_time", 0)
            snapshot[record_id] = copied
        return snapshot

async def get_by_id(self, id: str) -> dict[str, Any] | None:
async with self._storage_lock:
result = self._data.get(id)
Expand Down Expand Up @@ -200,6 +180,15 @@ async def delete(self, ids: list[str]) -> None:
if any_deleted:
await set_all_update_flags(self.final_namespace)

async def is_empty(self) -> bool:
    """Report whether the KV storage holds no entries.

    Returns:
        bool: True if storage contains no data, False otherwise
    """
    async with self._storage_lock:
        # Empty dict is falsy, so truthiness is equivalent to len == 0
        return not self._data

async def drop(self) -> dict[str, str]:
"""Drop all data from storage and clean up resources
This action will persistent the data to disk immediately.
Expand Down
44 changes: 28 additions & 16 deletions lightrag/kg/mongo_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,22 +175,6 @@ async def filter_keys(self, keys: set[str]) -> set[str]:
existing_ids = {str(x["_id"]) async for x in cursor}
return keys - existing_ids

async def get_all(self) -> dict[str, Any]:
    """Fetch every document in the collection, keyed by its ``_id``.

    Returns:
        Dictionary containing all stored data; every document is
        guaranteed to carry ``create_time`` and ``update_time`` keys
        (defaulting to 0 for legacy documents).
    """
    documents: dict[str, Any] = {}
    async for document in self._data.find({}):
        key = document.pop("_id")
        # Backfill time fields for documents written before they existed
        document.setdefault("create_time", 0)
        document.setdefault("update_time", 0)
        documents[key] = document
    return documents

async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}")
if not data:
Expand Down Expand Up @@ -236,6 +220,20 @@ async def index_done_callback(self) -> None:
# Mongo handles persistence automatically
pass

async def is_empty(self) -> bool:
    """Check if the storage is empty for the current workspace and namespace.

    Returns:
        bool: True if storage is empty (or if the check failed), False otherwise
    """
    try:
        # limit=1 lets the server stop scanning at the first matching document
        matched = await self._data.count_documents({}, limit=1)
    except PyMongoError as e:
        logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
        # Treat driver failure as "empty" so callers can proceed conservatively
        return True
    return matched == 0

async def delete(self, ids: list[str]) -> None:
"""Delete documents with specified IDs

Expand Down Expand Up @@ -466,6 +464,20 @@ async def index_done_callback(self) -> None:
# Mongo handles persistence automatically
pass

async def is_empty(self) -> bool:
    """Check if the storage is empty for the current workspace and namespace.

    Returns:
        bool: True if storage is empty (or if the check failed), False otherwise
    """
    try:
        # A capped count is enough: we only need to know whether any doc exists
        has_any = await self._data.count_documents({}, limit=1) > 0
        return not has_any
    except PyMongoError as e:
        logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
        # Fall back to "empty" on driver errors so callers can proceed
        return True

async def drop(self) -> dict[str, str]:
"""Drop the storage by removing all documents in the collection.

Expand Down
Loading