Skip to content

Commit 32732c3

Browse files
cin-cris and taprosoft authored
feat: allow users to set chunk size and overlap for indices (#524)
* use tzlocal to get the local time
* delete tmp folder
* update date_created and date_updated with the current timezone
* pass pre-commit
* update date_created field default to use local time
* add chunk size and chunk overlap params for indices
* refactor code to pass pre-commit
* fix: minor logic updates

---------

Co-authored-by: Tadashi <[email protected]>
1 parent a1fecfa commit 32732c3

File tree

3 files changed

+31
-2
lines changed

3 files changed

+31
-2
lines changed

libs/ktem/ktem/index/file/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ class BaseFileIndexIndexing(BaseComponent):
5555
FSPath = Param(help="The file storage path")
5656
user_id = Param(help="The user id")
5757
private = Param(False, help="Whether this is private index")
58+
chunk_size = Param(help="Chunk size for this index")
59+
chunk_overlap = Param(help="Chunk overlap for this index")
5860

5961
def run(
6062
self, file_paths: str | Path | list[str | Path], *args, **kwargs

libs/ktem/ktem/index/file/index.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,25 @@ def get_admin_settings(cls):
404404
"choices": [("Yes", True), ("No", False)],
405405
"info": "If private, files will not be accessible across users.",
406406
},
407+
"chunk_size": {
408+
"name": "Size of chunk (number of tokens)",
409+
"value": 0,
410+
"component": "number",
411+
"info": (
412+
"Number of tokens of each text segment. "
413+
"Set 0 to use developer setting."
414+
),
415+
},
416+
"chunk_overlap": {
417+
"name": "Number of overlapping tokens between chunks",
418+
"value": 0,
419+
"component": "number",
420+
"info": (
421+
"Number of tokens that consecutive text segments "
422+
"should overlap with each other. "
423+
"Set 0 to use developer setting."
424+
),
425+
},
407426
}
408427

409428
def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
@@ -423,6 +442,8 @@ def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
423442
obj.FSPath = self._fs_path
424443
obj.user_id = user_id
425444
obj.private = self.config.get("private", False)
445+
obj.chunk_size = self.config.get("chunk_size", 0)
446+
obj.chunk_overlap = self.config.get("chunk_overlap", 0)
426447

427448
return obj
428449

libs/ktem/ktem/index/file/pipelines.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,11 @@ def route(self, file_path: str | Path) -> IndexPipeline:
729729
730730
Can subclass this method for a more elaborate pipeline routing strategy.
731731
"""
732-
_, chunk_size, chunk_overlap = dev_settings()
732+
733+
_, dev_chunk_size, dev_chunk_overlap = dev_settings()
734+
735+
chunk_size = self.chunk_size or dev_chunk_size
736+
chunk_overlap = self.chunk_overlap or dev_chunk_overlap
733737

734738
# check if file_path is a URL
735739
if self.is_url(file_path):
@@ -744,12 +748,14 @@ def route(self, file_path: str | Path) -> IndexPipeline:
744748
"the suitable pipeline for this file type in the settings."
745749
)
746750

751+
print(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}")
752+
747753
print("Using reader", reader)
748754
pipeline: IndexPipeline = IndexPipeline(
749755
loader=reader,
750756
splitter=TokenSplitter(
751757
chunk_size=chunk_size or 1024,
752-
chunk_overlap=chunk_overlap if chunk_overlap is not None else 256,
758+
chunk_overlap=chunk_overlap or 256,
753759
separator="\n\n",
754760
backup_separators=["\n", ".", "\u200B"],
755761
),

0 commit comments

Comments
 (0)