11from copy import deepcopy
2+ from functools import cached_property
23from typing import Any
34
45from decouple import config
89from kotaemon .loaders import (
910 AdobeReader ,
1011 AzureAIDocumentIntelligenceLoader ,
12+ DoclingReader ,
1113 GOCR2ImageReader ,
1214 HtmlReader ,
1315 MhtmlReader ,
1416 PandasExcelReader ,
1517 PDFThumbnailReader ,
1618 TxtReader ,
1719 UnstructuredReader ,
20+ WebReader ,
1821)
1922
20- unstructured = UnstructuredReader ()
21- adobe_reader = AdobeReader ()
22- azure_reader = AzureAIDocumentIntelligenceLoader (
23- endpoint = str (config ("AZURE_DI_ENDPOINT" , default = "" )),
24- credential = str (config ("AZURE_DI_CREDENTIAL" , default = "" )),
25- cache_dir = getattr (flowsettings , "KH_MARKDOWN_OUTPUT_DIR" , None ),
26- )
27- adobe_reader .vlm_endpoint = azure_reader .vlm_endpoint = getattr (
28- flowsettings , "KH_VLM_ENDPOINT" , ""
29- )
23+
class ReaderFactory:
    """Lazily build and memoize one instance of each document reader.

    Each reader is exposed through ``functools.cached_property``: it is
    constructed the first time the attribute is read on a given factory and
    that same instance is returned on later reads. Creating a factory
    therefore does not instantiate any reader by itself.
    """

    @cached_property
    def web(self) -> WebReader:
        """Return this factory's ``WebReader``."""
        return WebReader()

    @cached_property
    def unstructured(self) -> UnstructuredReader:
        """Return this factory's ``UnstructuredReader``."""
        return UnstructuredReader()

    @cached_property
    def adobe(self) -> AdobeReader:
        """Return this factory's ``AdobeReader`` with its VLM endpoint set.

        The endpoint is read from ``flowsettings.KH_VLM_ENDPOINT`` and falls
        back to an empty string when that setting is not defined.
        """
        reader = AdobeReader()
        vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
        reader.vlm_endpoint = vlm_endpoint
        return reader

    @cached_property
    def azuredi(self) -> AzureAIDocumentIntelligenceLoader:
        """Return this factory's ``AzureAIDocumentIntelligenceLoader``.

        Endpoint and credential come from the ``AZURE_DI_ENDPOINT`` and
        ``AZURE_DI_CREDENTIAL`` config values (empty string when unset); the
        cache directory comes from ``flowsettings.KH_MARKDOWN_OUTPUT_DIR``
        (``None`` when unset). The VLM endpoint is then assigned from
        ``flowsettings.KH_VLM_ENDPOINT`` (empty string when unset).
        """
        di_endpoint = str(config("AZURE_DI_ENDPOINT", default=""))
        di_credential = str(config("AZURE_DI_CREDENTIAL", default=""))
        markdown_cache_dir = getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None)

        reader = AzureAIDocumentIntelligenceLoader(
            endpoint=di_endpoint,
            credential=di_credential,
            cache_dir=markdown_cache_dir,
        )
        reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
        return reader

    @cached_property
    def pandas_excel(self) -> PandasExcelReader:
        """Return this factory's ``PandasExcelReader``."""
        return PandasExcelReader()

    @cached_property
    def html(self) -> HtmlReader:
        """Return this factory's ``HtmlReader``."""
        return HtmlReader()

    @cached_property
    def mhtml(self) -> MhtmlReader:
        """Return this factory's ``MhtmlReader``."""
        return MhtmlReader()

    @cached_property
    def gocr(self) -> GOCR2ImageReader:
        """Return this factory's ``GOCR2ImageReader``."""
        return GOCR2ImageReader()

    @cached_property
    def txt(self) -> TxtReader:
        """Return this factory's ``TxtReader``."""
        return TxtReader()

    @cached_property
    def docling(self) -> DoclingReader:
        """Return this factory's ``DoclingReader``."""
        return DoclingReader()

    @cached_property
    def pdf_thumbnail(self) -> PDFThumbnailReader:
        """Return this factory's ``PDFThumbnailReader``."""
        return PDFThumbnailReader()
3076
3177
3278class ExtensionManager :
3379 """Pool of loaders for extensions"""
3480
35- def __init__ (self ):
81+ def __init__ (self , factory : ReaderFactory | None = None ):
82+ self .factory = factory or ReaderFactory ()
3683 self ._supported , self ._default_index = self ._init_supported ()
3784
3885 def get_current_loader (self ) -> dict [str , BaseReader ]:
@@ -43,26 +90,40 @@ def get_current_loader(self) -> dict[str, BaseReader]:
4390 }
4491 )
4592
46- @staticmethod
47- def _init_supported () -> tuple [dict [str , list [BaseReader ]], dict [str , str ]]:
48- gocr = GOCR2ImageReader ()
49-
93+ def _init_supported (self ) -> tuple [dict [str , list [BaseReader ]], dict [str , str ]]:
5094 supported : dict [str , list [BaseReader ]] = {
51- ".xlsx" : [PandasExcelReader ()],
52- ".docx" : [unstructured ],
53- ".pptx" : [unstructured ],
54- ".xls" : [unstructured ],
55- ".doc" : [unstructured ],
56- ".html" : [HtmlReader ()],
57- ".mhtml" : [MhtmlReader ()],
58- ".png" : [unstructured , gocr ],
59- ".jpeg" : [unstructured , gocr ],
60- ".jpg" : [unstructured , gocr ],
61- ".tiff" : [unstructured ],
62- ".tif" : [unstructured ],
63- ".pdf" : [PDFThumbnailReader (), adobe_reader , azure_reader ],
64- ".txt" : [TxtReader ()],
65- ".md" : [TxtReader ()],
95+ ".xlsx" : [self .factory .pandas_excel ],
96+ ".docx" : [self .factory .unstructured ],
97+ ".pptx" : [self .factory .unstructured ],
98+ ".xls" : [self .factory .unstructured ],
99+ ".doc" : [self .factory .unstructured ],
100+ ".html" : [self .factory .html ],
101+ ".mhtml" : [self .factory .mhtml ],
102+ ".png" : [
103+ self .factory .unstructured ,
104+ self .factory .gocr ,
105+ self .factory .docling ,
106+ ],
107+ ".jpeg" : [
108+ self .factory .unstructured ,
109+ self .factory .gocr ,
110+ self .factory .docling ,
111+ ],
112+ ".jpg" : [
113+ self .factory .unstructured ,
114+ self .factory .gocr ,
115+ self .factory .docling ,
116+ ],
117+ ".tiff" : [self .factory .unstructured , self .factory .docling ],
118+ ".tif" : [self .factory .unstructured , self .factory .docling ],
119+ ".pdf" : [
120+ self .factory .pdf_thumbnail ,
121+ self .factory .adobe ,
122+ self .factory .azuredi ,
123+ self .factory .docling ,
124+ ],
125+ ".txt" : [self .factory .txt ],
126+ ".md" : [self .factory .txt ],
66127 }
67128
68129 default_index = {
@@ -136,7 +197,3 @@ def generate_gradio_settings(self) -> dict[str, Any]:
136197
137198
138199extension_manager = ExtensionManager ()
139-
140-
141- if __name__ == "__main__" :
142- print (extension_manager .get_loaders_by_extension (".xlsx" ))
0 commit comments