@@ -9,7 +9,7 @@ def __init__(
         chunk_overlap: int = 200,
         length_function: callable = len,
     ):
-        self._separators = separators or [[10], [13, 10]]  # use newline characters as separators by default
+        self._separators = separators or []
         self._keep_separator = keep_separator
         self._chunk_size = chunk_size
         self._chunk_overlap = chunk_overlap
@@ -27,18 +27,12 @@ def _split_tokens_with_separators(self, tokens: List[int]) -> List[List[int]]:
             separator_found = False
             for separator in self._separators:
                 if tokens[i:i + len(separator)] == separator:
+                    if self._keep_separator in [True, "end"]:
+                        current_split.extend(separator)
                     if current_split:
-                        if self._keep_separator == "end":
-                            current_split.extend(separator)
-                            splits.append(current_split)
-                            current_split = []
-                        elif self._keep_separator == "start":
-                            splits.append(current_split)
-                            current_split = separator[:]
-                        else:
-                            splits.append(current_split)
-                            current_split = []
-                    elif self._keep_separator:
+                        splits.append(current_split)
+                        current_split = []
+                    if self._keep_separator == "start":
                         current_split.extend(separator)
                     i += len(separator)
                     separator_found = True
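
Not part of the patch, but for reviewers: a quick sketch of how the three keep_separator modes behave after this simplification, poking the private helper directly with a toy separator token id 0. Expected outputs are traced by hand from the new branch logic above.

# Sketch only; SeparatorSplitter as defined in this file.
s = SeparatorSplitter(separators=[[0]], keep_separator="end")
print(s._split_tokens_with_separators([1, 2, 0, 3]))  # [[1, 2, 0], [3]]

# keep_separator=True takes the same path as "end" via `in [True, "end"]`
s = SeparatorSplitter(separators=[[0]], keep_separator="start")
print(s._split_tokens_with_separators([1, 2, 0, 3]))  # [[1, 2], [0, 3]]

s = SeparatorSplitter(separators=[[0]], keep_separator=False)
print(s._split_tokens_with_separators([1, 2, 0, 3]))  # [[1, 2], [3]]
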
@@ -51,71 +45,48 @@ def _split_tokens_with_separators(self, tokens: List[int]) -> List[List[int]]:
         return [s for s in splits if s]
 
     def _merge_splits(self, splits: List[List[int]]) -> List[List[int]]:
+        if not splits:
+            return []
+
         merged_splits = []
-        current_split = []
-        current_length = 0
-        separator = [] if self._keep_separator is False else self._separators[-1]
+        current_chunk = []
 
         for split in splits:
-            if self._length_function(current_split) + self._length_function(split) <= self._chunk_size:
-                if current_split and separator:
-                    current_split.extend(separator)
-                current_split.extend(split)
+            if not current_chunk:
+                current_chunk = split
+            elif self._length_function(current_chunk) + self._length_function(split) <= self._chunk_size:
+                current_chunk.extend(split)
             else:
-                if current_split:
-                    merged_splits.append(current_split)
-                current_split = split
-            if self._length_function(current_split) >= self._chunk_size:
-                merged_splits.append(current_split)
-                current_split = []
-        if current_split:
-            merged_splits.append(current_split)
+                merged_splits.append(current_chunk)
+                current_chunk = split
+
+        if current_chunk:
+            merged_splits.append(current_chunk)
+
+        if len(merged_splits) == 1 and self._length_function(merged_splits[0]) > self._chunk_size:
+            return self._split_chunk(merged_splits[0])
 
         if self._chunk_overlap > 0:
            return self._enforce_overlap(merged_splits)
+
         return merged_splits
 
+    def _split_chunk(self, chunk: List[int]) -> List[List[int]]:
+        result = []
+        for i in range(0, len(chunk), self._chunk_size - self._chunk_overlap):
+            result.append(chunk[i:i + self._chunk_size])
+        return result
+
     def _enforce_overlap(self, chunks: List[List[int]]) -> List[List[int]]:
-        new_chunks = []
+        result = []
         for i, chunk in enumerate(chunks):
             if i == 0:
-                new_chunks.append(chunk)
+                result.append(chunk)
             else:
-                overlap_tokens = chunks[i - 1][-self._chunk_overlap:]
-                new_chunk = overlap_tokens + chunk
+                overlap = chunks[i - 1][-self._chunk_overlap:]
+                new_chunk = overlap + chunk
                 if self._length_function(new_chunk) > self._chunk_size:
-                    new_chunk = new_chunk[-self._chunk_size:]
-                new_chunks.append(new_chunk)
-        return new_chunks
-
-# EXAMPLE USAGE
-if __name__ == "__main__":
-    import tiktoken
-    tokenizer = tiktoken.encoding_for_model("gpt-4")
-
-    def tokenize(text: str) -> List[int]:
-        return tokenizer.encode(text)
-
-    def detokenize(tokens: List[int]) -> str:
-        return tokenizer.decode(tokens)
-
-    # create a splitter instance
-    splitter = SeparatorSplitter(
-        separators=[tokenize('\n'), tokenize('.')],  # use newline and period as separators
-        chunk_size=5,
-        chunk_overlap=0,
-        keep_separator="end"
-    )
-
-    # sample text
-    text = "This is a sample text. It contains multiple sentences.\nSome sentences are short. Others are longer."
-    tokens = tokenize(text)
-
-    # split the tokens
-    split_tokens = splitter.split_tokens(tokens)
+                    new_chunk = new_chunk[:self._chunk_size]
+                result.append(new_chunk)
+        return result
 
-    print("Split tokens:")
-    for i, token_chunk in enumerate(split_tokens):
-        print(f"Chunk {i + 1}:")
-        print(detokenize(token_chunk))
-        print("---")
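
Again outside the diff: a small sketch exercising the two behaviors changed above, with toy sizes (chunk_size=5, chunk_overlap=2) so the arithmetic is easy to follow. The new _split_chunk steps by chunk_size - chunk_overlap, and _enforce_overlap now trims overflow from the tail ([:self._chunk_size]) instead of the head, so the prepended overlap tokens survive.

# Sketch only; SeparatorSplitter as defined in this file, len as length function.
s = SeparatorSplitter(separators=[[0]], chunk_size=5, chunk_overlap=2)

# _split_chunk: stride = 5 - 2 = 3, window width = 5, so windows overlap by 2
print(s._split_chunk(list(range(8))))
# -> [[0, 1, 2, 3, 4], [3, 4, 5, 6, 7], [6, 7]]

# _enforce_overlap: the last two tokens of each chunk are prepended to the next;
# overflow beyond chunk_size is now cut from the end, not the front
print(s._enforce_overlap([[1, 2, 3, 4, 5], [6, 7, 8, 9]]))
# -> [[1, 2, 3, 4, 5], [4, 5, 6, 7, 8]]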
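The __main__ demo removed by this commit still runs against the refactored class; a minimal adaptation for reference (assumes tiktoken is installed and SeparatorSplitter is importable; the actual chunk boundaries depend on the tokenizer's ids):

import tiktoken

tokenizer = tiktoken.encoding_for_model("gpt-4")

splitter = SeparatorSplitter(
    separators=[tokenizer.encode("\n"), tokenizer.encode(".")],
    chunk_size=5,
    chunk_overlap=0,
    keep_separator="end",
)

text = "This is a sample text. It contains multiple sentences.\nSome are short."
for i, chunk in enumerate(splitter.split_tokens(tokenizer.encode(text)), 1):
    print(f"Chunk {i}: {tokenizer.decode(chunk)!r}")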