Commit f72f435

Merge pull request #2389 from danielaskdd/fix-chunk-size
Fix: Add chunk token limit validation with detailed error reporting
2 parents 5cc9168 + fec7c67

File tree

3 files changed: +1103 −1 lines changed

lightrag/exceptions.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -106,6 +106,28 @@ def __init__(self, message: str = "User cancelled"):
         self.message = message
 
 
+class ChunkTokenLimitExceededError(ValueError):
+    """Raised when a chunk exceeds the configured token limit."""
+
+    def __init__(
+        self,
+        chunk_tokens: int,
+        chunk_token_limit: int,
+        chunk_preview: str | None = None,
+    ) -> None:
+        preview = chunk_preview.strip() if chunk_preview else None
+        truncated_preview = preview[:80] if preview else None
+        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
+        message = (
+            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
+            f"{preview_note}"
+        )
+        super().__init__(message)
+        self.chunk_tokens = chunk_tokens
+        self.chunk_token_limit = chunk_token_limit
+        self.chunk_preview = truncated_preview
+
+
 class QdrantMigrationError(Exception):
     """Raised when Qdrant data migration from legacy collections fails."""
```
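For context, a minimal usage sketch (not part of the commit) of how a caller might catch the new exception and read its structured attributes. The token counts below are invented for illustration:

```python
from lightrag.exceptions import ChunkTokenLimitExceededError

try:
    # Invented numbers: pretend a character-split chunk measured 3000 tokens
    # against a configured chunk_token_size of 1024.
    raise ChunkTokenLimitExceededError(
        chunk_tokens=3000,
        chunk_token_limit=1024,
        chunk_preview="First 120 characters of the offending chunk...",
    )
except ChunkTokenLimitExceededError as exc:
    # The structured attributes enable the detailed error reporting
    # named in the commit title.
    print(exc.chunk_tokens)       # 3000
    print(exc.chunk_token_limit)  # 1024
    print(exc.chunk_preview)      # preview, stripped and capped at 80 chars
    print(exc)                    # full message, including the preview note
```

Because the class subclasses ValueError, existing handlers that already catch ValueError keep working unchanged.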
lightrag/operate.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -8,7 +8,10 @@
 from typing import Any, AsyncIterator, overload, Literal
 from collections import Counter, defaultdict
 
-from lightrag.exceptions import PipelineCancelledException
+from lightrag.exceptions import (
+    PipelineCancelledException,
+    ChunkTokenLimitExceededError,
+)
 from lightrag.utils import (
     logger,
     compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
     if split_by_character_only:
         for chunk in raw_chunks:
             _tokens = tokenizer.encode(chunk)
+            if len(_tokens) > chunk_token_size:
+                logger.warning(
+                    "Chunk split_by_character exceeds token limit: len=%d limit=%d",
+                    len(_tokens),
+                    chunk_token_size,
+                )
+                raise ChunkTokenLimitExceededError(
+                    chunk_tokens=len(_tokens),
+                    chunk_token_limit=chunk_token_size,
+                    chunk_preview=chunk[:120],
+                )
 112126
             new_chunks.append((len(_tokens), chunk))
 else:
     for chunk in raw_chunks:
```
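The guard lives only in the split_by_character_only branch: chunks taken verbatim from a character split are never re-split, so the check fails fast instead of passing an over-limit chunk downstream (presumably the other branch already re-splits oversized pieces by token window). Below is a self-contained sketch of the same guard logic; the WhitespaceTokenizer is a hypothetical stand-in for LightRAG's real tokenizer, and check_chunks is an invented helper, not a function from the repository:

```python
from lightrag.exceptions import ChunkTokenLimitExceededError


class WhitespaceTokenizer:
    """Hypothetical stand-in: one token per whitespace-separated word."""

    def encode(self, text: str) -> list[str]:
        return text.split()


def check_chunks(raw_chunks: list[str], chunk_token_size: int) -> list[tuple[int, str]]:
    """Mirror of the guard added to chunking_by_token_size, in isolation."""
    tokenizer = WhitespaceTokenizer()
    new_chunks: list[tuple[int, str]] = []
    for chunk in raw_chunks:
        _tokens = tokenizer.encode(chunk)
        if len(_tokens) > chunk_token_size:
            # Same reporting shape as the commit: both counts plus a
            # 120-character preview of the offending chunk.
            raise ChunkTokenLimitExceededError(
                chunk_tokens=len(_tokens),
                chunk_token_limit=chunk_token_size,
                chunk_preview=chunk[:120],
            )
        new_chunks.append((len(_tokens), chunk))
    return new_chunks


# check_chunks(["a " * 2000], chunk_token_size=1024) raises
# ChunkTokenLimitExceededError; check_chunks(["short text"], 1024)
# returns [(2, "short text")].
```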
