Commit f72f435

Merge pull request #2389 from danielaskdd/fix-chunk-size
Fix: Add chunk token limit validation with detailed error reporting
2 parents 5cc9168 + fec7c67

File tree

3 files changed: +1103 −1 lines changed

lightrag/exceptions.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -106,6 +106,28 @@ def __init__(self, message: str = "User cancelled"):
         self.message = message
 
 
+class ChunkTokenLimitExceededError(ValueError):
+    """Raised when a chunk exceeds the configured token limit."""
+
+    def __init__(
+        self,
+        chunk_tokens: int,
+        chunk_token_limit: int,
+        chunk_preview: str | None = None,
+    ) -> None:
+        preview = chunk_preview.strip() if chunk_preview else None
+        truncated_preview = preview[:80] if preview else None
+        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
+        message = (
+            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
+            f"{preview_note}"
+        )
+        super().__init__(message)
+        self.chunk_tokens = chunk_tokens
+        self.chunk_token_limit = chunk_token_limit
+        self.chunk_preview = truncated_preview
+
+
 class QdrantMigrationError(Exception):
     """Raised when Qdrant data migration from legacy collections fails."""
```
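For context, a minimal usage sketch (not part of the commit) of how a caller might catch the new exception and read its structured attributes. The token counts below are invented for illustration:

```python
from lightrag.exceptions import ChunkTokenLimitExceededError

try:
    # Invented numbers: pretend a character-split chunk measured 3000 tokens
    # against a configured chunk_token_size of 1024.
    raise ChunkTokenLimitExceededError(
        chunk_tokens=3000,
        chunk_token_limit=1024,
        chunk_preview="First 120 characters of the offending chunk...",
    )
except ChunkTokenLimitExceededError as exc:
    # The structured attributes enable the detailed error reporting
    # named in the commit title.
    print(exc.chunk_tokens)       # 3000
    print(exc.chunk_token_limit)  # 1024
    print(exc.chunk_preview)      # preview, stripped and capped at 80 chars
    print(exc)                    # full message, including the preview note
```

Because the class subclasses ValueError, existing handlers that already catch ValueError keep working unchanged.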
lightrag/operate.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -8,7 +8,10 @@
 from typing import Any, AsyncIterator, overload, Literal
 from collections import Counter, defaultdict
 
-from lightrag.exceptions import PipelineCancelledException
+from lightrag.exceptions import (
+    PipelineCancelledException,
+    ChunkTokenLimitExceededError,
+)
 from lightrag.utils import (
     logger,
     compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
     if split_by_character_only:
         for chunk in raw_chunks:
             _tokens = tokenizer.encode(chunk)
+            if len(_tokens) > chunk_token_size:
+                logger.warning(
+                    "Chunk split_by_character exceeds token limit: len=%d limit=%d",
+                    len(_tokens),
+                    chunk_token_size,
+                )
+                raise ChunkTokenLimitExceededError(
+                    chunk_tokens=len(_tokens),
+                    chunk_token_limit=chunk_token_size,
+                    chunk_preview=chunk[:120],
+                )
 112126
             new_chunks.append((len(_tokens), chunk))
 else:
     for chunk in raw_chunks:
```
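The guard lives only in the split_by_character_only branch: chunks taken verbatim from a character split are never re-split, so the check fails fast instead of passing an over-limit chunk downstream (presumably the other branch already re-splits oversized pieces by token window). Below is a self-contained sketch of the same guard logic; the WhitespaceTokenizer is a hypothetical stand-in for LightRAG's real tokenizer, and check_chunks is an invented helper, not a function from the repository:

```python
from lightrag.exceptions import ChunkTokenLimitExceededError


class WhitespaceTokenizer:
    """Hypothetical stand-in: one token per whitespace-separated word."""

    def encode(self, text: str) -> list[str]:
        return text.split()


def check_chunks(raw_chunks: list[str], chunk_token_size: int) -> list[tuple[int, str]]:
    """Mirror of the guard added to chunking_by_token_size, in isolation."""
    tokenizer = WhitespaceTokenizer()
    new_chunks: list[tuple[int, str]] = []
    for chunk in raw_chunks:
        _tokens = tokenizer.encode(chunk)
        if len(_tokens) > chunk_token_size:
            # Same reporting shape as the commit: both counts plus a
            # 120-character preview of the offending chunk.
            raise ChunkTokenLimitExceededError(
                chunk_tokens=len(_tokens),
                chunk_token_limit=chunk_token_size,
                chunk_preview=chunk[:120],
            )
        new_chunks.append((len(_tokens), chunk))
    return new_chunks


# check_chunks(["a " * 2000], chunk_token_size=1024) raises
# ChunkTokenLimitExceededError; check_chunks(["short text"], 1024)
# returns [(2, "short text")].
```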
