Skip to content

Commit 801c276

Browse files
authored
feat: allow custom chunking method (gusye1234#40)
1 parent 7910840 commit 801c276

File tree

3 files changed

+74
-2
lines changed

3 files changed

+74
-2
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
2+
3+
from nano_graphrag._utils import encode_string_by_tiktoken
4+
from nano_graphrag.base import QueryParam
5+
from nano_graphrag.graphrag import GraphRAG
6+
7+
8+
def chunking_by_specific_separators(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    """Chunk *content* on natural text boundaries using langchain's recursive splitter.

    Drop-in replacement for nano-graphrag's default ``chunking_by_token_size``:
    it is passed to ``GraphRAG(chunk_func=...)`` and must return the same shape.

    Args:
        content: The full document text to split.
        overlap_token_size: Token overlap between consecutive chunks.
        max_token_size: Maximum tokens per chunk.
        tiktoken_model: Model name used to pick the tiktoken encoding.

    Returns:
        A list of dicts, one per chunk, each with:
        ``content`` — the stripped chunk text, and
        ``chunk_order_index`` — the chunk's 0-based position.
    """
    # Imported lazily so the package itself does not require langchain;
    # only users who opt into this chunker pay the dependency.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # Separators are tried in order: prefer paragraph breaks, then lines,
    # then sentence punctuation, then whitespace, then split anywhere.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
        model_name=tiktoken_model,
        is_separator_regex=False,
        separators=[
            # Paragraph separators
            "\n\n",
            "\r\n\r\n",
            # Line breaks
            "\n",
            "\r\n",
            # Sentence-ending punctuation
            "。",  # Chinese period
            # NOTE(review): the next entry was labeled "full-width dot"
            # (U+FF0E) upstream but renders as an ASCII "." here, making it a
            # duplicate of the entry below — confirm against the original file.
            ".",  # Full-width dot
            ".",  # English period
            "!",  # Chinese exclamation mark
            "!",  # English exclamation mark
            "?",  # Chinese question mark
            "?",  # English question mark
            # Whitespace characters
            " ",  # Space
            "\t",  # Tab
            "\u3000",  # Full-width space
            # Special characters
            "\u200b",  # Zero-width space (used in some Asian languages)
            # Final fallback: split at any character
            "",
        ],
    )
    # Build the dict list the GraphRAG pipeline expects from a chunk_func.
    return [
        {
            "content": chunk_text.strip(),
            "chunk_order_index": index,
        }
        for index, chunk_text in enumerate(text_splitter.split_text(content))
    ]
56+
57+
58+
# Demo: run GraphRAG with the custom separator-based chunker defined above.
WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"
rag = GraphRAG(
    working_dir=WORKING_DIR,
    # Override the default token-size chunker with the custom splitter.
    chunk_func=chunking_by_specific_separators,
)

# utf-8-sig transparently strips a leading BOM if the file has one.
with open("../tests/mock_data.txt", encoding="utf-8-sig") as f:
    FAKE_TEXT = f.read()

# NOTE(review): insertion is commented out — this run presumably reuses an
# index already cached in WORKING_DIR; uncomment to (re)build it.
# rag.insert(FAKE_TEXT)
print(rag.query("What the main theme of this story?", param=QueryParam(mode="local")))

nano_graphrag/_op.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ def chunking_by_token_size(
4949
return results
5050

5151

52+
53+
54+
5255
async def _handle_entity_relation_summary(
5356
entity_or_relation_name: str,
5457
description: str,

nano_graphrag/graphrag.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from dataclasses import asdict, dataclass, field
44
from datetime import datetime
55
from functools import partial
6-
from typing import Type, cast
6+
from typing import Callable, Dict, List, Optional, Type, Union, cast
77

88

99
from ._llm import (
@@ -65,6 +65,7 @@ class GraphRAG:
6565
enable_naive_rag: bool = False
6666

6767
# text chunking
68+
chunk_func: Callable[[str, Optional[int], Optional[int], Optional[str]], List[Dict[str, Union[str, int]]]] = chunking_by_token_size
6869
chunk_token_size: int = 1200
6970
chunk_overlap_token_size: int = 100
7071
tiktoken_model_name: str = "gpt-4o"
@@ -269,7 +270,7 @@ async def ainsert(self, string_or_strings):
269270
**dp,
270271
"full_doc_id": doc_key,
271272
}
272-
for dp in chunking_by_token_size(
273+
for dp in self.chunk_func(
273274
doc["content"],
274275
overlap_token_size=self.chunk_overlap_token_size,
275276
max_token_size=self.chunk_token_size,

0 commit comments

Comments
 (0)