Skip to content

Commit ae56308

Browse files
committed
feat: update docling reader into extension manager
1 parent 609a1f0 commit ae56308

File tree

7 files changed

+105
-53
lines changed

7 files changed

+105
-53
lines changed

libs/kotaemon/kotaemon/indices/ingests/files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
from typing import Type
33

44
from decouple import config
5+
from ktem.extensions.extensions import extension_manager
56
from llama_index.core.readers.base import BaseReader
67
from llama_index.readers.file import PDFReader
78
from theflow.settings import settings as flowsettings
89

910
from kotaemon.base import BaseComponent, Document, Param
1011
from kotaemon.indices.extractors import BaseDocParser
11-
from kotaemon.indices.ingests.extensions import extension_manager
1212
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
1313
from kotaemon.loaders import (
1414
AdobeReader,

libs/ktem/ktem/app.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,12 @@
77
from ktem.assets import PDFJS_PREBUILT_DIR, KotaemonTheme
88
from ktem.components import reasonings
99
from ktem.exceptions import HookAlreadyDeclared, HookNotDeclared
10+
from ktem.extensions.extensions import extension_manager
1011
from ktem.index import IndexManager
1112
from ktem.settings import BaseSettingGroup, SettingGroup, SettingReasoningGroup
1213
from theflow.settings import settings
1314
from theflow.utils.modules import import_dotted_string
1415

15-
from kotaemon.indices.ingests.extensions import extension_manager
16-
1716

1817
class BaseApp:
1918
"""The main app of Kotaemon

libs/ktem/ktem/extensions/__init__.py

Whitespace-only changes.

libs/kotaemon/kotaemon/indices/ingests/extensions.py renamed to libs/ktem/ktem/extensions/extensions.py

Lines changed: 91 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from copy import deepcopy
2+
from functools import cached_property
23
from typing import Any
34

45
from decouple import config
@@ -8,31 +9,77 @@
89
from kotaemon.loaders import (
910
AdobeReader,
1011
AzureAIDocumentIntelligenceLoader,
12+
DoclingReader,
1113
GOCR2ImageReader,
1214
HtmlReader,
1315
MhtmlReader,
1416
PandasExcelReader,
1517
PDFThumbnailReader,
1618
TxtReader,
1719
UnstructuredReader,
20+
WebReader,
1821
)
1922

20-
unstructured = UnstructuredReader()
21-
adobe_reader = AdobeReader()
22-
azure_reader = AzureAIDocumentIntelligenceLoader(
23-
endpoint=str(config("AZURE_DI_ENDPOINT", default="")),
24-
credential=str(config("AZURE_DI_CREDENTIAL", default="")),
25-
cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
26-
)
27-
adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
28-
flowsettings, "KH_VLM_ENDPOINT", ""
29-
)
23+
24+
class ReaderFactory:
    """Build document readers on demand and keep one instance of each.

    Every reader is exposed as a ``cached_property``: the reader object is
    constructed the first time the attribute is accessed on a factory
    instance, and that same object is returned on every later access.
    """

    @cached_property
    def web(self) -> WebReader:
        """Return this factory's ``WebReader``, creating it on first access."""
        return WebReader()

    @cached_property
    def unstructured(self) -> UnstructuredReader:
        """Return this factory's ``UnstructuredReader``, creating it on first access."""
        return UnstructuredReader()

    @cached_property
    def adobe(self) -> AdobeReader:
        """Return this factory's ``AdobeReader``, creating it on first access.

        The reader's ``vlm_endpoint`` is taken from the ``KH_VLM_ENDPOINT``
        flow setting, falling back to an empty string when it is not set.
        """
        reader = AdobeReader()
        vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
        reader.vlm_endpoint = vlm_endpoint
        return reader

    @cached_property
    def azuredi(self) -> AzureAIDocumentIntelligenceLoader:
        """Return this factory's ``AzureAIDocumentIntelligenceLoader``.

        The endpoint and credential are read through ``config`` from
        ``AZURE_DI_ENDPOINT`` / ``AZURE_DI_CREDENTIAL`` (empty string when
        missing); the cache directory comes from the ``KH_MARKDOWN_OUTPUT_DIR``
        flow setting (``None`` when missing). The loader's ``vlm_endpoint``
        is taken from the ``KH_VLM_ENDPOINT`` flow setting, defaulting to "".
        """
        endpoint = str(config("AZURE_DI_ENDPOINT", default=""))
        credential = str(config("AZURE_DI_CREDENTIAL", default=""))
        cache_dir = getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None)
        loader = AzureAIDocumentIntelligenceLoader(
            endpoint=endpoint,
            credential=credential,
            cache_dir=cache_dir,
        )
        loader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
        return loader

    @cached_property
    def pandas_excel(self) -> PandasExcelReader:
        """Return this factory's ``PandasExcelReader``, creating it on first access."""
        return PandasExcelReader()

    @cached_property
    def html(self) -> HtmlReader:
        """Return this factory's ``HtmlReader``, creating it on first access."""
        return HtmlReader()

    @cached_property
    def mhtml(self) -> MhtmlReader:
        """Return this factory's ``MhtmlReader``, creating it on first access."""
        return MhtmlReader()

    @cached_property
    def gocr(self) -> GOCR2ImageReader:
        """Return this factory's ``GOCR2ImageReader``, creating it on first access."""
        return GOCR2ImageReader()

    @cached_property
    def txt(self) -> TxtReader:
        """Return this factory's ``TxtReader``, creating it on first access."""
        return TxtReader()

    @cached_property
    def docling(self) -> DoclingReader:
        """Return this factory's ``DoclingReader``, creating it on first access."""
        return DoclingReader()

    @cached_property
    def pdf_thumbnail(self) -> PDFThumbnailReader:
        """Return this factory's ``PDFThumbnailReader``, creating it on first access."""
        return PDFThumbnailReader()
3076

3177

3278
class ExtensionManager:
3379
"""Pool of loaders for extensions"""
3480

35-
def __init__(self):
81+
def __init__(self, factory: ReaderFactory | None = None):
82+
self.factory = factory or ReaderFactory()
3683
self._supported, self._default_index = self._init_supported()
3784

3885
def get_current_loader(self) -> dict[str, BaseReader]:
@@ -43,26 +90,40 @@ def get_current_loader(self) -> dict[str, BaseReader]:
4390
}
4491
)
4592

46-
@staticmethod
47-
def _init_supported() -> tuple[dict[str, list[BaseReader]], dict[str, str]]:
48-
gocr = GOCR2ImageReader()
49-
93+
def _init_supported(self) -> tuple[dict[str, list[BaseReader]], dict[str, str]]:
5094
supported: dict[str, list[BaseReader]] = {
51-
".xlsx": [PandasExcelReader()],
52-
".docx": [unstructured],
53-
".pptx": [unstructured],
54-
".xls": [unstructured],
55-
".doc": [unstructured],
56-
".html": [HtmlReader()],
57-
".mhtml": [MhtmlReader()],
58-
".png": [unstructured, gocr],
59-
".jpeg": [unstructured, gocr],
60-
".jpg": [unstructured, gocr],
61-
".tiff": [unstructured],
62-
".tif": [unstructured],
63-
".pdf": [PDFThumbnailReader(), adobe_reader, azure_reader],
64-
".txt": [TxtReader()],
65-
".md": [TxtReader()],
95+
".xlsx": [self.factory.pandas_excel],
96+
".docx": [self.factory.unstructured],
97+
".pptx": [self.factory.unstructured],
98+
".xls": [self.factory.unstructured],
99+
".doc": [self.factory.unstructured],
100+
".html": [self.factory.html],
101+
".mhtml": [self.factory.mhtml],
102+
".png": [
103+
self.factory.unstructured,
104+
self.factory.gocr,
105+
self.factory.docling,
106+
],
107+
".jpeg": [
108+
self.factory.unstructured,
109+
self.factory.gocr,
110+
self.factory.docling,
111+
],
112+
".jpg": [
113+
self.factory.unstructured,
114+
self.factory.gocr,
115+
self.factory.docling,
116+
],
117+
".tiff": [self.factory.unstructured, self.factory.docling],
118+
".tif": [self.factory.unstructured, self.factory.docling],
119+
".pdf": [
120+
self.factory.pdf_thumbnail,
121+
self.factory.adobe,
122+
self.factory.azuredi,
123+
self.factory.docling,
124+
],
125+
".txt": [self.factory.txt],
126+
".md": [self.factory.txt],
66127
}
67128

68129
default_index = {
@@ -136,7 +197,3 @@ def generate_gradio_settings(self) -> dict[str, Any]:
136197

137198

138199
extension_manager = ExtensionManager()
139-
140-
141-
if __name__ == "__main__":
142-
print(extension_manager.get_loaders_by_extension(".xlsx"))

libs/ktem/ktem/index/file/pipelines.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import tiktoken
1616
from ktem.db.models import engine
1717
from ktem.embeddings.manager import embedding_models_manager
18+
from ktem.extensions.extensions import extension_manager
1819
from ktem.llms.manager import llms
1920
from ktem.rerankings.manager import reranking_models_manager
2021
from llama_index.core.readers.base import BaseReader
@@ -34,14 +35,10 @@
3435
from kotaemon.base import BaseComponent, Document, Node, Param, RetrievedDocument
3536
from kotaemon.embeddings import BaseEmbeddings
3637
from kotaemon.indices import VectorIndexing, VectorRetrieval
37-
from kotaemon.indices.ingests.extensions import extension_manager
38-
from kotaemon.indices.ingests.files import ( # KH_DEFAULT_FILE_EXTRACTORS,
39-
adobe_reader,
40-
azure_reader,
41-
docling_reader,
42-
unstructured,
43-
web_reader,
44-
)
38+
39+
# from kotaemon.indices.ingests.files import ( # KH_DEFAULT_FILE_EXTRACTORS,
40+
# web_reader,
41+
# )
4542
from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring
4643
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
4744

@@ -674,11 +671,11 @@ def readers(self):
674671
readers: dict[str, BaseReader] = extension_manager.get_current_loader()
675672
print("reader_mode", self.reader_mode)
676673
if self.reader_mode == "adobe":
677-
readers[".pdf"] = adobe_reader
674+
readers[".pdf"] = extension_manager.factory.adobe
678675
elif self.reader_mode == "azure-di":
679-
readers[".pdf"] = azure_reader
676+
readers[".pdf"] = extension_manager.factory.azuredi
680677
elif self.reader_mode == "docling":
681-
readers[".pdf"] = docling_reader
678+
readers[".pdf"] = extension_manager.factory.docling
682679

683680
dev_readers, _, _ = dev_settings()
684681
readers.update(dev_readers)
@@ -737,11 +734,11 @@ def route(self, file_path: str | Path) -> IndexPipeline:
737734

738735
# check if file_path is a URL
739736
if self.is_url(file_path):
740-
reader = web_reader
737+
reader = extension_manager.factory.web
741738
else:
742739
assert isinstance(file_path, Path)
743740
ext = file_path.suffix.lower()
744-
reader = self.readers.get(ext, unstructured)
741+
reader = self.readers.get(ext, extension_manager.factory.unstructured)
745742
if reader is None:
746743
raise NotImplementedError(
747744
f"No supported pipeline to index {file_path.name}. Please specify "

libs/ktem/ktem/pages/chat/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from ktem.app import BasePage
99
from ktem.components import reasonings
1010
from ktem.db.models import Conversation, engine
11+
from ktem.extensions.extensions import extension_manager
1112
from ktem.index.file.ui import File
1213
from ktem.reasoning.prompt_optimization.suggest_conversation_name import (
1314
SuggestConvNamePipeline,
@@ -20,7 +21,6 @@
2021
from theflow.settings import settings as flowsettings
2122

2223
from kotaemon.base import Document
23-
from kotaemon.indices.ingests.extensions import extension_manager
2424

2525
from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex
2626
from .chat_panel import ChatPanel

libs/ktem/ktem/pages/settings.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,9 @@
44
from ktem.app import BasePage
55
from ktem.components import reasonings
66
from ktem.db.models import Settings, User, engine
7+
from ktem.extensions.extensions import extension_manager
78
from sqlmodel import Session, select
89

9-
from kotaemon.indices.ingests.extensions import extension_manager
10-
1110
signout_js = """
1211
function(u, c, pw, pwc) {
1312
removeFromStorage('username');

0 commit comments

Comments
 (0)