Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions examples/using_custom_chunking_method.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@


from nano_graphrag._utils import encode_string_by_tiktoken
from nano_graphrag.base import QueryParam
from nano_graphrag.graphrag import GraphRAG


def chunking_by_specific_separators(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    """Chunk *content* with langchain's RecursiveCharacterTextSplitter.

    Splits on a prioritized list of paragraph / sentence / whitespace
    separators (CJK-aware), sizing chunks by tiktoken token count so the
    result is compatible with nano-graphrag's default chunking contract.

    Args:
        content: Raw document text to split.
        overlap_token_size: Token overlap between consecutive chunks.
        max_token_size: Maximum tokens per chunk.
        tiktoken_model: Model name used to pick the tiktoken encoding.

    Returns:
        A list of dicts with keys ``content`` (stripped chunk text) and
        ``chunk_order_index`` (0-based position).
        NOTE(review): unlike the default chunker this omits a "tokens"
        count — confirm downstream consumers tolerate the missing key.
    """
    # Imported lazily so the example only requires langchain when used.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
        model_name=tiktoken_model,
        is_separator_regex=False,  # every entry below is a literal string
        separators=[
            # Paragraph separators
            "\n\n",
            "\r\n\r\n",
            # Line breaks
            "\n",
            "\r\n",
            # Sentence-ending punctuation. Full-width (CJK) forms come
            # before their ASCII counterparts so Chinese/Japanese text
            # splits on its own punctuation first.
            "。",   # Ideographic period (U+3002)
            "．",   # Full-width dot (U+FF0E)
            ".",    # ASCII period
            "！",   # Full-width exclamation mark (U+FF01)
            "!",    # ASCII exclamation mark
            "？",   # Full-width question mark (U+FF1F)
            "?",    # ASCII question mark
            # Whitespace characters
            " ",       # Space
            "\t",      # Tab
            "\u3000",  # Ideographic (full-width) space
            # Special characters
            "\u200b",  # Zero-width space (used in some Asian scripts)
            # Final fallback: split anywhere
            "",
        ],
    )
    texts = text_splitter.split_text(content)

    return [
        {
            "content": chunk_content.strip(),
            "chunk_order_index": index,
        }
        for index, chunk_content in enumerate(texts)
    ]


# Example driver: wire the custom chunker into a GraphRAG instance.
WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"
rag = GraphRAG(
    working_dir=WORKING_DIR,
    # Replace the default token-size chunking with the separator-based splitter.
    chunk_func=chunking_by_specific_separators,
)

# utf-8-sig transparently strips a leading BOM if the file was saved with one.
with open("../tests/mock_data.txt", encoding="utf-8-sig") as f:
    FAKE_TEXT = f.read()

# rag.insert(FAKE_TEXT)
# NOTE(review): insert is commented out above, so the query below assumes a
# previously built index already exists in WORKING_DIR — confirm before running.
print(rag.query("What the main theme of this story?", param=QueryParam(mode="local")))
3 changes: 3 additions & 0 deletions nano_graphrag/_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ def chunking_by_token_size(
return results





async def _handle_entity_relation_summary(
entity_or_relation_name: str,
description: str,
Expand Down
5 changes: 3 additions & 2 deletions nano_graphrag/graphrag.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from dataclasses import asdict, dataclass, field
from datetime import datetime
from functools import partial
from typing import Type, cast
from typing import Callable, Dict, List, Optional, Type, Union, cast


from ._llm import (
Expand Down Expand Up @@ -65,6 +65,7 @@ class GraphRAG:
enable_naive_rag: bool = False

# text chunking
chunk_func: Callable[[str, Optional[int], Optional[int], Optional[str]], List[Dict[str, Union[str, int]]]] = chunking_by_token_size
chunk_token_size: int = 1200
chunk_overlap_token_size: int = 100
tiktoken_model_name: str = "gpt-4o"
Expand Down Expand Up @@ -269,7 +270,7 @@ async def ainsert(self, string_or_strings):
**dp,
"full_doc_id": doc_key,
}
for dp in chunking_by_token_size(
for dp in self.chunk_func(
doc["content"],
overlap_token_size=self.chunk_overlap_token_size,
max_token_size=self.chunk_token_size,
Expand Down