21 changes: 21 additions & 0 deletions lightrag/exceptions.py
@@ -106,6 +106,27 @@ def __init__(self, message: str = "User cancelled"):
        self.message = message


class ChunkTokenLimitExceededError(ValueError):
    """Raised when a chunk exceeds the configured token limit."""

    def __init__(
        self,
        chunk_tokens: int,
        chunk_token_limit: int,
        chunk_preview: str | None = None,
    ) -> None:
        preview = chunk_preview.strip() if chunk_preview else None
        truncated_preview = preview[:80] if preview else None
        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
        super().__init__(
            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
            f"{preview_note}"
        )
        self.chunk_tokens = chunk_tokens
        self.chunk_token_limit = chunk_token_limit
        self.chunk_preview = truncated_preview


class QdrantMigrationError(Exception):
    """Raised when Qdrant data migration from legacy collections fails."""

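As a quick sanity check of the message formatting, here is a minimal sketch of the new exception in use (the constructor values below are illustrative, not taken from this PR):

    from lightrag.exceptions import ChunkTokenLimitExceededError

    err = ChunkTokenLimitExceededError(
        chunk_tokens=120,
        chunk_token_limit=100,
        chunk_preview="lorem ipsum " * 20,  # illustrative oversized preview
    )
    # The preview is stripped, then truncated to 80 characters before being
    # stored on the instance and appended to the message.
    print(err)  # Chunk token length 120 exceeds chunk_token_size 100. Preview: 'lorem ipsum ...'
    assert len(err.chunk_preview) == 80
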
16 changes: 15 additions & 1 deletion lightrag/operate.py
@@ -8,7 +8,10 @@
 from typing import Any, AsyncIterator, overload, Literal
 from collections import Counter, defaultdict

-from lightrag.exceptions import PipelineCancelledException
+from lightrag.exceptions import (
+    PipelineCancelledException,
+    ChunkTokenLimitExceededError,
+)
 from lightrag.utils import (
     logger,
     compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
        if split_by_character_only:
            for chunk in raw_chunks:
                _tokens = tokenizer.encode(chunk)
                if len(_tokens) > chunk_token_size:
                    logger.warning(
                        "Chunk split_by_character exceeds token limit: len=%d limit=%d",
                        len(_tokens),
                        chunk_token_size,
                    )
                    raise ChunkTokenLimitExceededError(
                        chunk_tokens=len(_tokens),
                        chunk_token_limit=chunk_token_size,
                        chunk_preview=chunk[:120],
                    )
                new_chunks.append((len(_tokens), chunk))
        else:
            for chunk in raw_chunks:
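
For callers, the new error carries structured fields rather than only a log line; a minimal caller-side sketch (tokenizer and document_text are assumed to exist in the surrounding scope, and the handling policy is illustrative):

    from lightrag.exceptions import ChunkTokenLimitExceededError
    from lightrag.operate import chunking_by_token_size

    try:
        chunks = chunking_by_token_size(
            tokenizer,
            document_text,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=1024,
        )
    except ChunkTokenLimitExceededError as exc:
        # The structured fields let the caller decide whether to re-split
        # the input or surface the offending chunk to the user.
        print(exc.chunk_tokens, exc.chunk_token_limit, exc.chunk_preview)
        raise
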
113 changes: 113 additions & 0 deletions tests/test_chunking.py
@@ -0,0 +1,113 @@
import pytest

from lightrag.exceptions import ChunkTokenLimitExceededError
from lightrag.operate import chunking_by_token_size
from lightrag.utils import Tokenizer, TokenizerInterface


class DummyTokenizer(TokenizerInterface):
    def encode(self, content: str):
        return [ord(ch) for ch in content]

    def decode(self, tokens):
        return "".join(chr(token) for token in tokens)


def make_tokenizer() -> Tokenizer:
    return Tokenizer(model_name="dummy", tokenizer=DummyTokenizer())


@pytest.mark.offline
def test_split_by_character_only_within_limit():
    """Test chunking when all chunks are within the token limit."""
    tokenizer = make_tokenizer()

    chunks = chunking_by_token_size(
        tokenizer,
        "alpha\n\nbeta",
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )

    assert [chunk["content"] for chunk in chunks] == ["alpha", "beta"]


@pytest.mark.offline
def test_split_by_character_only_exceeding_limit_raises():
    """Test that oversized chunks raise ChunkTokenLimitExceededError."""
    tokenizer = make_tokenizer()
    oversized = "a" * 12

    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            oversized,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=5,
        )

    err = excinfo.value
    assert err.chunk_tokens == len(oversized)
    assert err.chunk_token_limit == 5


@pytest.mark.offline
def test_chunk_error_includes_preview():
    """Test that the error message includes a chunk preview."""
    tokenizer = make_tokenizer()
    oversized = "x" * 100

    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            oversized,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )

    err = excinfo.value
    # Preview should be the first 80 chars of the 100-char input
    assert err.chunk_preview == "x" * 80
    assert "Preview:" in str(err)


@pytest.mark.offline
def test_split_by_character_only_at_exact_limit():
    """Test chunking when a chunk is exactly at the token limit."""
    tokenizer = make_tokenizer()
    exact_size = "a" * 10

    chunks = chunking_by_token_size(
        tokenizer,
        exact_size,
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )

    assert len(chunks) == 1
    assert chunks[0]["content"] == exact_size
    assert chunks[0]["tokens"] == 10


@pytest.mark.offline
def test_split_by_character_only_one_over_limit():
    """Test that a chunk one token over the limit raises the error."""
    tokenizer = make_tokenizer()
    one_over = "a" * 11

    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            one_over,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )

    err = excinfo.value
    assert err.chunk_tokens == 11
    assert err.chunk_token_limit == 10
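
Assuming the offline marker used above is registered in the project's pytest configuration, the new suite can be run in isolation with something like:

    pytest tests/test_chunking.py -m offline -v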