Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
54f0a7d
Quick fix to limit source_id ballooning while inserting nodes
divineslight Oct 14, 2025
d52c337
Import from env and use default if none and removed useless import
haseebuchiha Oct 14, 2025
c06522b
Get max source Id config from .env and lightRAG init
divineslight Oct 15, 2025
9f49e56
Merge branch 'main' into feat-entity-size-caps
danielaskdd Oct 17, 2025
7871600
Quick fix to limit source_id ballooning while inserting nodes
divineslight Oct 14, 2025
4e740af
Import from env and use default if none and removed useless import
haseebuchiha Oct 14, 2025
17c2a92
Get max source Id config from .env and lightRAG init
divineslight Oct 15, 2025
6b37d3c
Merge branch 'feat-entity-size-caps' into limit-vdb-metadata-size
danielaskdd Oct 17, 2025
03333d6
Merge branch 'main' into limit-vdb-metadata-size
danielaskdd Oct 17, 2025
bdadaa6
Merge branch 'main' into limit-vdb-metadata-size
danielaskdd Oct 18, 2025
dc62c78
Add entity/relation chunk tracking with configurable source ID limits
danielaskdd Oct 20, 2025
a9fec26
Add file path limit configuration for entities and relations
danielaskdd Oct 20, 2025
e0fd31a
Fix logging message formatting
danielaskdd Oct 20, 2025
637b850
Add truncation indicator and update property labels in graph view
danielaskdd Oct 20, 2025
e01c998
Track placeholders in file paths for accurate source count display
danielaskdd Oct 20, 2025
665f60b
Refactor entity/relation merge to consolidate VDB operations within f…
danielaskdd Oct 20, 2025
1154c56
Refactor deduplication calculation and remove unused variables
danielaskdd Oct 20, 2025
019dff5
Update truncation message format in properties tooltip
danielaskdd Oct 20, 2025
cd1c48b
Standardize placeholder format to use colon separator consistently
danielaskdd Oct 20, 2025
1248b3a
Increase default limits for source IDs and file paths in metadata
danielaskdd Oct 20, 2025
a525324
Simplify skip logging and reduce pipeline status updates
danielaskdd Oct 20, 2025
be3d274
Refactor node and edge merging logic with improved code structure
danielaskdd Oct 21, 2025
80668aa
Improve file path truncation labels and UI consistency
danielaskdd Oct 21, 2025
3ad616b
Change default source IDs limit method from KEEP to FIFO
danielaskdd Oct 21, 2025
3ed2abd
Improve logging to show source ID ratios when skipping entities/edges
danielaskdd Oct 21, 2025
e5e16b7
Fix Redis data migration error
danielaskdd Oct 21, 2025
88a4552
Increase default max file paths from 30 to 100 and improve documentation
danielaskdd Oct 21, 2025
fe890fc
Improve formatting of limit method info in rebuild functions
danielaskdd Oct 21, 2025
a809245
Preserve file path order by using lists instead of sets
danielaskdd Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,6 @@ ENABLE_LLM_CACHE=true
### control the maximum tokens send to LLM (include entities, relations and chunks)
# MAX_TOTAL_TOKENS=30000

### maximum number of related chunks per source entity or relation
### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
### Higher values increase re-ranking time
# RELATED_CHUNK_NUMBER=5

### chunk selection strategies
### VECTOR: Pick KG chunks by vector similarity, delivered chunks to the LLM aligning more closely with naive retrieval
### WEIGHT: Pick KG chunks by entity and chunk weight, delivered more solely KG related chunks to the LLM
Expand Down Expand Up @@ -140,6 +135,22 @@ SUMMARY_LANGUAGE=English
### Maximum context size sent to LLM for description summary
# SUMMARY_CONTEXT_SIZE=12000

### control the maximum chunk_ids stored in vector and graph db
# MAX_SOURCE_IDS_PER_ENTITY=300
# MAX_SOURCE_IDS_PER_RELATION=300
### control chunk_ids limitation method: FIFO, KEEP
### FIFO: First in first out
### KEEP: Keep oldest (fewer merge actions and faster)
# SOURCE_IDS_LIMIT_METHOD=FIFO

# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
# MAX_FILE_PATHS=100

### maximum number of related chunks per source entity or relation
### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
### Higher values increase re-ranking time
# RELATED_CHUNK_NUMBER=5

###############################
### Concurrency Configuration
###############################
Expand Down
2 changes: 1 addition & 1 deletion lightrag/api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__api_version__ = "0241"
__api_version__ = "0242"
2 changes: 2 additions & 0 deletions lightrag/api/routers/document_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,8 @@ async def clear_documents():
rag.full_docs,
rag.full_entities,
rag.full_relations,
rag.entity_chunks,
rag.relation_chunks,
rag.entities_vdb,
rag.relationships_vdb,
rag.chunks_vdb,
Expand Down
8 changes: 8 additions & 0 deletions lightrag/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,14 @@ async def delete(self, ids: list[str]) -> None:
None
"""

@abstractmethod
async def is_empty(self) -> bool:
    """Check if the storage is empty.

    Concrete storage backends must implement this; it is used to decide
    whether data migration/initialization work is needed.

    Returns:
        bool: True if storage contains no data, False otherwise
    """


@dataclass
class BaseGraphStorage(StorageNameSpace, ABC):
Expand Down
21 changes: 20 additions & 1 deletion lightrag/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,27 @@
DEFAULT_MIN_RERANK_SCORE = 0.0
DEFAULT_RERANK_BINDING = "null"

# File path configuration for vector and graph database(Should not be changed, used in Milvus Schema)
# Default source ids limit in meta data for entity and relation
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
### control chunk_ids limitation method: FIFO, KEEP
### FIFO: First in first out
### KEEP: Keep oldest (fewer merge actions and faster)
SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {
SOURCE_IDS_LIMIT_METHOD_KEEP,
SOURCE_IDS_LIMIT_METHOD_FIFO,
}
# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
DEFAULT_MAX_FILE_PATHS = 100

# Field length of file_path in Milvus Schema for entity and relation (Should not be changed)
# file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata.
DEFAULT_MAX_FILE_PATH_LENGTH = 32768
# Placeholder for more file paths in meta data for entity and relation (Should not be changed)
DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated"

# Default temperature for LLM
DEFAULT_TEMPERATURE = 1.0
Expand Down
14 changes: 14 additions & 0 deletions lightrag/kg/json_doc_status_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,20 @@ async def upsert(self, data: dict[str, dict[str, Any]]) -> None:

await self.index_done_callback()

async def is_empty(self) -> bool:
    """Report whether this doc-status storage currently holds no records.

    Returns:
        bool: True if storage is empty, False otherwise

    Raises:
        StorageNotInitializedError: If storage is not initialized
    """
    lock = self._storage_lock
    if lock is None:
        # initialize_storage() has not run yet; fail loudly rather than guess
        raise StorageNotInitializedError("JsonDocStatusStorage")
    async with lock:
        return not self._data

async def get_by_id(self, id: str) -> Union[dict[str, Any], None]:
async with self._storage_lock:
return self._data.get(id)
Expand Down
29 changes: 9 additions & 20 deletions lightrag/kg/json_kv_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,26 +84,6 @@ async def index_done_callback(self) -> None:
write_json(data_dict, self._file_name)
await clear_all_update_flags(self.final_namespace)

async def get_all(self) -> dict[str, Any]:
    """Return a snapshot of every record currently in storage.

    Returns:
        Dictionary containing all stored data. Each truthy record is
        shallow-copied and guaranteed to carry ``create_time`` and
        ``update_time`` keys (defaulting to 0 for legacy records);
        falsy records are passed through untouched.
    """
    async with self._storage_lock:
        snapshot: dict[str, Any] = {}
        for record_id, record in self._data.items():
            if not record:
                # Keep empty/None records exactly as stored
                snapshot[record_id] = record
                continue
            # Copy so callers cannot mutate the live store
            copied = dict(record)
            copied.setdefault("create_time", 0)
            copied.setdefault("update_time", 0)
            snapshot[record_id] = copied
        return snapshot

async def get_by_id(self, id: str) -> dict[str, Any] | None:
async with self._storage_lock:
result = self._data.get(id)
Expand Down Expand Up @@ -200,6 +180,15 @@ async def delete(self, ids: list[str]) -> None:
if any_deleted:
await set_all_update_flags(self.final_namespace)

async def is_empty(self) -> bool:
    """Report whether the KV storage holds no entries.

    Returns:
        bool: True if storage contains no data, False otherwise
    """
    async with self._storage_lock:
        # Empty dict is falsy, so truthiness is equivalent to len == 0
        return not self._data

async def drop(self) -> dict[str, str]:
"""Drop all data from storage and clean up resources
This action will persistent the data to disk immediately.
Expand Down
44 changes: 28 additions & 16 deletions lightrag/kg/mongo_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,22 +175,6 @@ async def filter_keys(self, keys: set[str]) -> set[str]:
existing_ids = {str(x["_id"]) async for x in cursor}
return keys - existing_ids

async def get_all(self) -> dict[str, Any]:
    """Fetch every document in the collection, keyed by its ``_id``.

    Returns:
        Dictionary containing all stored data; every document is
        guaranteed to carry ``create_time`` and ``update_time`` keys
        (defaulting to 0 for legacy documents).
    """
    documents: dict[str, Any] = {}
    async for document in self._data.find({}):
        key = document.pop("_id")
        # Backfill time fields for documents written before they existed
        document.setdefault("create_time", 0)
        document.setdefault("update_time", 0)
        documents[key] = document
    return documents

async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}")
if not data:
Expand Down Expand Up @@ -236,6 +220,20 @@ async def index_done_callback(self) -> None:
# Mongo handles persistence automatically
pass

async def is_empty(self) -> bool:
    """Check if the storage is empty for the current workspace and namespace.

    Returns:
        bool: True if storage is empty (or if the check failed), False otherwise
    """
    try:
        # limit=1 lets the server stop scanning at the first matching document
        matched = await self._data.count_documents({}, limit=1)
    except PyMongoError as e:
        logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
        # Treat driver failure as "empty" so callers can proceed conservatively
        return True
    return matched == 0

async def delete(self, ids: list[str]) -> None:
"""Delete documents with specified IDs

Expand Down Expand Up @@ -466,6 +464,20 @@ async def index_done_callback(self) -> None:
# Mongo handles persistence automatically
pass

async def is_empty(self) -> bool:
    """Check if the storage is empty for the current workspace and namespace.

    Returns:
        bool: True if storage is empty (or if the check failed), False otherwise
    """
    try:
        # A capped count is enough: we only need to know whether any doc exists
        has_any = await self._data.count_documents({}, limit=1) > 0
        return not has_any
    except PyMongoError as e:
        logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
        # Fall back to "empty" on driver errors so callers can proceed
        return True

async def drop(self) -> dict[str, str]:
"""Drop the storage by removing all documents in the collection.

Expand Down
Loading