Skip to content

Commit 801c276

Browse files
authored
feat: allow custom chunking method (gusye1234#40)
1 parent 7910840 commit 801c276

File tree

3 files changed

+74
-2
lines changed

3 files changed

+74
-2
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
2+
3+
from nano_graphrag._utils import encode_string_by_tiktoken
4+
from nano_graphrag.base import QueryParam
5+
from nano_graphrag.graphrag import GraphRAG
6+
7+
8+
def chunking_by_specific_separators(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    """Chunk *content* on natural text boundaries using langchain's recursive splitter.

    Drop-in replacement for nano-graphrag's default ``chunking_by_token_size``:
    it is passed to ``GraphRAG(chunk_func=...)`` and must return the same shape.

    Args:
        content: The full document text to split.
        overlap_token_size: Token overlap between consecutive chunks.
        max_token_size: Maximum tokens per chunk.
        tiktoken_model: Model name used to pick the tiktoken encoding.

    Returns:
        A list of dicts, one per chunk, each with:
        ``content`` — the stripped chunk text, and
        ``chunk_order_index`` — the chunk's 0-based position.
    """
    # Imported lazily so the package itself does not require langchain;
    # only users who opt into this chunker pay the dependency.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # Separators are tried in order: prefer paragraph breaks, then lines,
    # then sentence punctuation, then whitespace, then split anywhere.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
        model_name=tiktoken_model,
        is_separator_regex=False,
        separators=[
            # Paragraph separators
            "\n\n",
            "\r\n\r\n",
            # Line breaks
            "\n",
            "\r\n",
            # Sentence-ending punctuation
            "。",  # Chinese period
            # NOTE(review): the next entry was labeled "full-width dot"
            # (U+FF0E) upstream but renders as an ASCII "." here, making it a
            # duplicate of the entry below — confirm against the original file.
            ".",  # Full-width dot
            ".",  # English period
            "!",  # Chinese exclamation mark
            "!",  # English exclamation mark
            "?",  # Chinese question mark
            "?",  # English question mark
            # Whitespace characters
            " ",  # Space
            "\t",  # Tab
            "\u3000",  # Full-width space
            # Special characters
            "\u200b",  # Zero-width space (used in some Asian languages)
            # Final fallback: split at any character
            "",
        ],
    )
    # Build the dict list the GraphRAG pipeline expects from a chunk_func.
    return [
        {
            "content": chunk_text.strip(),
            "chunk_order_index": index,
        }
        for index, chunk_text in enumerate(text_splitter.split_text(content))
    ]
56+
57+
58+
# Demo: run GraphRAG with the custom separator-based chunker defined above.
WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"
rag = GraphRAG(
    working_dir=WORKING_DIR,
    # Override the default token-size chunker with the custom splitter.
    chunk_func=chunking_by_specific_separators,
)

# utf-8-sig transparently strips a leading BOM if the file has one.
with open("../tests/mock_data.txt", encoding="utf-8-sig") as f:
    FAKE_TEXT = f.read()

# NOTE(review): insertion is commented out — this run presumably reuses an
# index already cached in WORKING_DIR; uncomment to (re)build it.
# rag.insert(FAKE_TEXT)
print(rag.query("What the main theme of this story?", param=QueryParam(mode="local")))

nano_graphrag/_op.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ def chunking_by_token_size(
4949
return results
5050

5151

52+
53+
54+
5255
async def _handle_entity_relation_summary(
5356
entity_or_relation_name: str,
5457
description: str,

nano_graphrag/graphrag.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from dataclasses import asdict, dataclass, field
44
from datetime import datetime
55
from functools import partial
6-
from typing import Type, cast
6+
from typing import Callable, Dict, List, Optional, Type, Union, cast
77

88

99
from ._llm import (
@@ -65,6 +65,7 @@ class GraphRAG:
6565
enable_naive_rag: bool = False
6666

6767
# text chunking
68+
chunk_func: Callable[[str, Optional[int], Optional[int], Optional[str]], List[Dict[str, Union[str, int]]]] = chunking_by_token_size
6869
chunk_token_size: int = 1200
6970
chunk_overlap_token_size: int = 100
7071
tiktoken_model_name: str = "gpt-4o"
@@ -269,7 +270,7 @@ async def ainsert(self, string_or_strings):
269270
**dp,
270271
"full_doc_id": doc_key,
271272
}
272-
for dp in chunking_by_token_size(
273+
for dp in self.chunk_func(
273274
doc["content"],
274275
overlap_token_size=self.chunk_overlap_token_size,
275276
max_token_size=self.chunk_token_size,

0 commit comments

Comments
 (0)