Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion flowsettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@
}

USE_NANO_GRAPHRAG = config("USE_NANO_GRAPHRAG", default=False, cast=bool)
USE_LIGHTRAG = config("USE_LIGHTRAG", default=False, cast=bool)
USE_LIGHTRAG = config("USE_LIGHTRAG", default=True, cast=bool)

GRAPHRAG_INDEX_TYPES = ["ktem.index.file.graph.GraphRAGIndex"]

Expand Down
6 changes: 4 additions & 2 deletions libs/ktem/ktem/index/file/graph/lightrag_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "lightrag"
filestorage_path.mkdir(parents=True, exist_ok=True)

INDEX_BATCHSIZE = 2
INDEX_BATCHSIZE = 4


def get_llm_func(model):
Expand Down Expand Up @@ -268,7 +268,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):

for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
graphrag_func.insert(cur_docs)
combined_doc = "\n".join(cur_docs)

graphrag_func.insert(combined_doc)
process_doc_count += len(cur_docs)
yield Document(
channel="debug",
Expand Down
4 changes: 3 additions & 1 deletion libs/ktem/ktem/index/file/graph/nano_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):
)
for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
graphrag_func.insert(cur_docs)
combined_doc = "\n".join(cur_docs)

graphrag_func.insert(combined_doc)
process_doc_count += len(cur_docs)
yield Document(
channel="debug",
Expand Down
15 changes: 15 additions & 0 deletions libs/ktem/ktem/index/file/graph/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@
filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "graphrag"
filestorage_path.mkdir(parents=True, exist_ok=True)

# Error text used for the ValueError raised by the GraphRAG indexing and
# retrieval entry points when check_graphrag_api_key() reports that the
# GRAPHRAG_API_KEY environment variable is missing or empty.
GRAPHRAG_KEY_MISSING_MESSAGE = (
"GRAPHRAG_API_KEY is not set. Please set it to use the GraphRAG retriever pipeline."
)


def check_graphrag_api_key():
    """Report whether the GRAPHRAG_API_KEY environment variable is usable.

    Returns:
        True when the variable is set to a non-empty string; False when it
        is unset or set to the empty string.
    """
    api_key = os.getenv("GRAPHRAG_API_KEY", "")
    # An empty string is falsy, so this matches a "length greater than zero"
    # test exactly.
    return bool(api_key)


def prepare_graph_index_path(graph_id: str):
root_path = Path(filestorage_path) / graph_id
Expand Down Expand Up @@ -99,6 +107,9 @@ def write_docs_to_files(self, graph_id: str, docs: list[Document]):
return root_path

def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
if not check_graphrag_api_key():
raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)

# call GraphRAG index with docs and graph_id
input_path = self.write_docs_to_files(graph_id, all_docs)
input_path = str(input_path.absolute())
Expand Down Expand Up @@ -346,6 +357,10 @@ def run(
) -> list[RetrievedDocument]:
if not self.file_ids:
return []

if not check_graphrag_api_key():
raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)

context_builder = self._build_graph_search()

local_context_params = {
Expand Down
5 changes: 5 additions & 0 deletions libs/ktem/ktem/index/file/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,11 @@ def on_register_events(self):
if self._index.id == 1:
self.quick_upload_state = gr.State(value=[])
print("Setting up quick upload event")

# override indexing function from chat page
self._app.chat_page.first_indexing_url_fn = (
self.index_fn_url_with_default_loaders
)
quickUploadedEvent = (
self._app.chat_page.quick_file_upload.upload(
fn=lambda: gr.update(
Expand Down
37 changes: 31 additions & 6 deletions libs/ktem/ktem/pages/chat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from kotaemon.base import Document
from kotaemon.indices.ingests.files import KH_DEFAULT_FILE_EXTRACTORS

from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex
from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex, get_urls
from .chat_panel import ChatPanel
from .common import STATE
from .control import ConversationControl
Expand Down Expand Up @@ -140,6 +140,7 @@ def on_building_ui(self):
# get the file selector choices for the first index
if index_id == 0:
self.first_selector_choices = index_ui.selector_choices
self.first_indexing_url_fn = None

if gr_index:
if isinstance(gr_index, list):
Expand Down Expand Up @@ -284,6 +285,7 @@ def on_register_events(self):
self.chat_panel.text_input,
self.chat_panel.chatbot,
self._app.user_id,
self._app.settings_state,
self.chat_control.conversation_id,
self.chat_control.conversation_rn,
self.first_selector_choices,
Expand Down Expand Up @@ -634,6 +636,7 @@ def submit_msg(
chat_input,
chat_history,
user_id,
settings,
conv_id,
conv_name,
first_selector_choices,
Expand All @@ -643,22 +646,44 @@ def submit_msg(
raise ValueError("Input is empty")

chat_input_text = chat_input.get("text", "")
file_ids = []

# get all file names with pattern @"filename" in input_str
file_names, chat_input_text = get_file_names_regex(chat_input_text)
first_selector_choices_map = {
item[0]: item[1] for item in first_selector_choices
}
file_ids = []

if file_names:
# get all file names with pattern @"filename" in input_str
file_names, chat_input_text = get_file_names_regex(chat_input_text)
# get all urls in input_str
urls, chat_input_text = get_urls(chat_input_text)

if urls and self.first_indexing_url_fn:
print("Detected URLs", urls)
file_ids = self.first_indexing_url_fn(
"\n".join(urls),
True,
settings,
user_id,
)
elif file_names:
for file_name in file_names:
file_id = first_selector_choices_map.get(file_name)
if file_id:
file_ids.append(file_id)

# add new file ids to the first selector choices
first_selector_choices.extend(zip(urls, file_ids))

# if file_ids is not empty and chat_input_text is empty
# set the input to summary
if not chat_input_text and file_ids:
chat_input_text = "Summary"

if file_ids:
selector_output = ["select", file_ids]
selector_output = [
"select",
gr.update(value=file_ids, choices=first_selector_choices),
]
else:
selector_output = [gr.update(), gr.update()]

Expand Down
4 changes: 2 additions & 2 deletions libs/ktem/ktem/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .conversation import get_file_names_regex
from .conversation import get_file_names_regex, get_urls
from .lang import SUPPORTED_LANGUAGE_MAP

__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex"]
__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex", "get_urls"]
10 changes: 10 additions & 0 deletions libs/ktem/ktem/utils/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,15 @@ def get_file_names_regex(input_str: str) -> tuple[list[str], str]:
return matches, input_str


def get_urls(input_str: str) -> tuple[list[str], str]:
    """Extract http(s) URLs from ``input_str`` and remove them from the text.

    A URL candidate is any run of non-whitespace characters starting with
    ``http://`` or ``https://``. Because users often write a URL at the end
    of a sentence or inside brackets, trailing sentence punctuation
    (``. , ; : ! ? " '``) and *unbalanced* closing brackets are trimmed from
    each returned URL. A closing bracket is kept when the URL contains a
    matching opening bracket (e.g. ``.../wiki/Foo_(bar)``).

    Args:
        input_str: Free-form text that may contain URLs.

    Returns:
        A tuple ``(urls, remaining_text)`` where ``urls`` lists the cleaned
        URLs in order of appearance and ``remaining_text`` is ``input_str``
        with every URL candidate (including any trimmed trailing characters)
        removed and surrounding whitespace stripped.
    """
    pattern = r"https?://[^\s]+"
    trailing_punctuation = ".,;:!?\"'"
    openers = {")": "(", "]": "[", "}": "{"}

    matches = []
    for url in re.findall(pattern, input_str):
        # Peel characters that belong to the surrounding sentence, not the URL.
        while url:
            last = url[-1]
            if last in trailing_punctuation:
                url = url[:-1]
            elif last in openers and (
                url.count(last) > url.count(openers[last])
            ):
                # More closers than openers: the bracket wraps the URL.
                url = url[:-1]
            else:
                break
        matches.append(url)

    # Remove the full raw candidates so the remaining text is the same as it
    # would be without the trimming step.
    input_str = re.sub(pattern, "", input_str).strip()

    return matches, input_str


if __name__ == "__main__":
print(sync_retrieval_n_message([[""], [""], [""]], []))
Loading