Skip to content

Commit 7a02cb7

Browse files
authored
feat: add URL indexing directly from chat input (#571) bump:patch
* feat: enable lightrag by default and add graphrag key check
* feat: add URL indexing from chatbox
1 parent a0c9a6e commit 7a02cb7

File tree

8 files changed

+71
-12
lines changed

8 files changed

+71
-12
lines changed

flowsettings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@
296296
}
297297

298298
USE_NANO_GRAPHRAG = config("USE_NANO_GRAPHRAG", default=False, cast=bool)
299-
USE_LIGHTRAG = config("USE_LIGHTRAG", default=False, cast=bool)
299+
USE_LIGHTRAG = config("USE_LIGHTRAG", default=True, cast=bool)
300300

301301
GRAPHRAG_INDEX_TYPES = ["ktem.index.file.graph.GraphRAGIndex"]
302302

libs/ktem/ktem/index/file/graph/lightrag_pipelines.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "lightrag"
4646
filestorage_path.mkdir(parents=True, exist_ok=True)
4747

48-
INDEX_BATCHSIZE = 2
48+
INDEX_BATCHSIZE = 4
4949

5050

5151
def get_llm_func(model):
@@ -268,7 +268,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):
268268

269269
for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
270270
cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
271-
graphrag_func.insert(cur_docs)
271+
combined_doc = "\n".join(cur_docs)
272+
273+
graphrag_func.insert(combined_doc)
272274
process_doc_count += len(cur_docs)
273275
yield Document(
274276
channel="debug",

libs/ktem/ktem/index/file/graph/nano_pipelines.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):
263263
)
264264
for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
265265
cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
266-
graphrag_func.insert(cur_docs)
266+
combined_doc = "\n".join(cur_docs)
267+
268+
graphrag_func.insert(combined_doc)
267269
process_doc_count += len(cur_docs)
268270
yield Document(
269271
channel="debug",

libs/ktem/ktem/index/file/graph/pipelines.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,14 @@
4747
filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "graphrag"
4848
filestorage_path.mkdir(parents=True, exist_ok=True)
4949

50+
GRAPHRAG_KEY_MISSING_MESSAGE = (
51+
"GRAPHRAG_API_KEY is not set. Please set it to use the GraphRAG retriever pipeline."
52+
)
53+
54+
55+
def check_graphrag_api_key():
    """Return True when the ``GRAPHRAG_API_KEY`` environment variable is usable.

    The variable is read from the process environment on every call, so a key
    exported after import time is still picked up.

    Returns:
        bool: ``True`` if ``GRAPHRAG_API_KEY`` is set to a non-blank value,
        ``False`` if it is missing, empty, or consists only of whitespace.
    """
    # A whitespace-only value (e.g. ``GRAPHRAG_API_KEY= `` in a .env file) is
    # not a usable key, so it is treated the same as an unset variable.
    return bool(os.getenv("GRAPHRAG_API_KEY", "").strip())
57+
5058

5159
def prepare_graph_index_path(graph_id: str):
5260
root_path = Path(filestorage_path) / graph_id
@@ -99,6 +107,9 @@ def write_docs_to_files(self, graph_id: str, docs: list[Document]):
99107
return root_path
100108

101109
def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
110+
if not check_graphrag_api_key():
111+
raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)
112+
102113
# call GraphRAG index with docs and graph_id
103114
input_path = self.write_docs_to_files(graph_id, all_docs)
104115
input_path = str(input_path.absolute())
@@ -346,6 +357,10 @@ def run(
346357
) -> list[RetrievedDocument]:
347358
if not self.file_ids:
348359
return []
360+
361+
if not check_graphrag_api_key():
362+
raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)
363+
349364
context_builder = self._build_graph_search()
350365

351366
local_context_params = {

libs/ktem/ktem/index/file/ui.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,11 @@ def on_register_events(self):
683683
if self._index.id == 1:
684684
self.quick_upload_state = gr.State(value=[])
685685
print("Setting up quick upload event")
686+
687+
# override indexing function from chat page
688+
self._app.chat_page.first_indexing_url_fn = (
689+
self.index_fn_url_with_default_loaders
690+
)
686691
quickUploadedEvent = (
687692
self._app.chat_page.quick_file_upload.upload(
688693
fn=lambda: gr.update(

libs/ktem/ktem/pages/chat/__init__.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from kotaemon.base import Document
2323
from kotaemon.indices.ingests.files import KH_DEFAULT_FILE_EXTRACTORS
2424

25-
from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex
25+
from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex, get_urls
2626
from .chat_panel import ChatPanel
2727
from .common import STATE
2828
from .control import ConversationControl
@@ -140,6 +140,7 @@ def on_building_ui(self):
140140
# get the file selector choices for the first index
141141
if index_id == 0:
142142
self.first_selector_choices = index_ui.selector_choices
143+
self.first_indexing_url_fn = None
143144

144145
if gr_index:
145146
if isinstance(gr_index, list):
@@ -284,6 +285,7 @@ def on_register_events(self):
284285
self.chat_panel.text_input,
285286
self.chat_panel.chatbot,
286287
self._app.user_id,
288+
self._app.settings_state,
287289
self.chat_control.conversation_id,
288290
self.chat_control.conversation_rn,
289291
self.first_selector_choices,
@@ -634,6 +636,7 @@ def submit_msg(
634636
chat_input,
635637
chat_history,
636638
user_id,
639+
settings,
637640
conv_id,
638641
conv_name,
639642
first_selector_choices,
@@ -643,22 +646,44 @@ def submit_msg(
643646
raise ValueError("Input is empty")
644647

645648
chat_input_text = chat_input.get("text", "")
649+
file_ids = []
646650

647-
# get all file names with pattern @"filename" in input_str
648-
file_names, chat_input_text = get_file_names_regex(chat_input_text)
649651
first_selector_choices_map = {
650652
item[0]: item[1] for item in first_selector_choices
651653
}
652-
file_ids = []
653654

654-
if file_names:
655+
# get all file names with pattern @"filename" in input_str
656+
file_names, chat_input_text = get_file_names_regex(chat_input_text)
657+
# get all urls in input_str
658+
urls, chat_input_text = get_urls(chat_input_text)
659+
660+
if urls and self.first_indexing_url_fn:
661+
print("Detected URLs", urls)
662+
file_ids = self.first_indexing_url_fn(
663+
"\n".join(urls),
664+
True,
665+
settings,
666+
user_id,
667+
)
668+
elif file_names:
655669
for file_name in file_names:
656670
file_id = first_selector_choices_map.get(file_name)
657671
if file_id:
658672
file_ids.append(file_id)
659673

674+
# add new file ids to the first selector choices
675+
first_selector_choices.extend(zip(urls, file_ids))
676+
677+
# if file_ids is not empty and chat_input_text is empty
678+
# set the input to summary
679+
if not chat_input_text and file_ids:
680+
chat_input_text = "Summary"
681+
660682
if file_ids:
661-
selector_output = ["select", file_ids]
683+
selector_output = [
684+
"select",
685+
gr.update(value=file_ids, choices=first_selector_choices),
686+
]
662687
else:
663688
selector_output = [gr.update(), gr.update()]
664689

libs/ktem/ktem/utils/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .conversation import get_file_names_regex
1+
from .conversation import get_file_names_regex, get_urls
22
from .lang import SUPPORTED_LANGUAGE_MAP
33

4-
__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex"]
4+
__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex", "get_urls"]

libs/ktem/ktem/utils/conversation.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,15 @@ def get_file_names_regex(input_str: str) -> tuple[list[str], str]:
2929
return matches, input_str
3030

3131

32+
def get_urls(input_str: str) -> tuple[list[str], str]:
    """Extract HTTP(S) URLs from ``input_str`` and remove them from the text.

    Args:
        input_str: Free-form text that may contain ``http://`` or
            ``https://`` URLs.

    Returns:
        A ``(urls, remaining_text)`` tuple. ``urls`` lists the URLs in order
        of appearance (duplicates are kept), with sentence punctuation and
        unbalanced closing brackets trimmed from their end. ``remaining_text``
        is ``input_str`` with every matched URL span removed and the result
        stripped of leading/trailing whitespace.
    """
    pattern = r"https?://[^\s]+"
    closers = {")": "(", "]": "[", "}": "{"}

    def _trim(url: str) -> str:
        # Drop punctuation glued to the end of a URL by the surrounding prose
        # ("see https://a.com/x." or "(https://a.com)"). A closing bracket is
        # only dropped when it has no matching opener inside the URL, so
        # links such as ".../wiki/Foo_(bar)" stay intact.
        while url:
            last = url[-1]
            if last in ".,;:!?\"'>":
                url = url[:-1]
            elif last in closers and url.count(last) > url.count(closers[last]):
                url = url[:-1]
            else:
                break
        return url

    matches = []
    for raw in re.findall(pattern, input_str):
        url = _trim(raw)
        # Skip matches that are nothing but a scheme once trimmed.
        if url.split("://", 1)[1]:
            matches.append(url)

    # Remove the full raw match (including the trimmed punctuation) so a
    # message consisting only of "https://a.com." still ends up empty.
    input_str = re.sub(pattern, "", input_str).strip()

    return matches, input_str
40+
41+
3242
if __name__ == "__main__":
3343
print(sync_retrieval_n_message([[""], [""], [""]], []))

0 commit comments

Comments
 (0)