Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,17 @@ documents and developers who want to build their own RAG pipeline.

See [Local model setup](docs/local_model.md).

### Setup multimodal document parsing (OCR, table parsing, figure extraction)

These options are available:

- [Azure Document Intelligence (API)](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence)
- [Adobe PDF Extract (API)](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/)
- [Docling (local, open-source)](https://github.com/DS4SD/docling)
- To use Docling, first install the required dependencies: `pip install docling`

Select the corresponding loader in `Settings -> Retrieval Settings -> File loader`.

### Customize your application

- By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
Expand Down
8 changes: 5 additions & 3 deletions libs/kotaemon/kotaemon/indices/ingests/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
AdobeReader,
AzureAIDocumentIntelligenceLoader,
DirectoryReader,
DoclingReader,
HtmlReader,
MathpixPDFReader,
MhtmlReader,
Expand All @@ -32,9 +33,10 @@
credential=str(config("AZURE_DI_CREDENTIAL", default="")),
cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
)
adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
flowsettings, "KH_VLM_ENDPOINT", ""
)
docling_reader = DoclingReader()
adobe_reader.vlm_endpoint = (
azure_reader.vlm_endpoint
) = docling_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")


KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
Expand Down
2 changes: 2 additions & 0 deletions libs/kotaemon/kotaemon/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docling_loader import DoclingReader
from .docx_loader import DocxReader
from .excel_loader import ExcelReader, PandasExcelReader
from .html_loader import HtmlReader, MhtmlReader
Expand Down Expand Up @@ -30,4 +31,5 @@
"TxtReader",
"PDFThumbnailReader",
"WebReader",
"DoclingReader",
]
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
"""
left, upper, right, lower = bbox

left, right = min(left, right), max(left, right)
upper, lower = min(upper, lower), max(upper, lower)

img: Image.Image
suffix = file_path.suffix.lower()
if suffix == ".pdf":
Expand Down
232 changes: 232 additions & 0 deletions libs/kotaemon/kotaemon/loaders/docling_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
import base64
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import List, Optional

from kotaemon.base import Document, Param

from .azureai_document_intelligence_loader import crop_image
from .base import BaseReader
from .utils.adobe import generate_single_figure_caption, make_markdown_table


class DoclingReader(BaseReader):
    """Use Docling to extract document structure and content.

    Produces three kinds of `Document`s from one input file:
    per-page plain text, markdown-formatted tables, and figures
    (cropped image stored base64-encoded in metadata, with extractive
    and optional VLM-generated captions as the document text).
    """

    _dependencies = ["docling"]

    vlm_endpoint: str = Param(
        help=(
            "Default VLM endpoint for figure captioning. "
            "If not provided, will not caption the figures"
        )
    )

    max_figure_to_caption: int = Param(
        100,
        help=(
            "The maximum number of figures to caption. "
            "The rest will be indexed without captions."
        ),
    )

    figure_friendly_filetypes: list[str] = Param(
        [".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"],
        help=(
            "File types that we can reliably open and extract figures. "
            "For files like .docx or .html, the visual layout may be different "
            "when viewed from different tools, hence we cannot use Azure DI location "
            "to extract figures."
        ),
    )

    @Param.auto(cache=True)
    def converter_(self):
        """Lazily build and cache the Docling ``DocumentConverter``.

        Raises:
            ImportError: if the optional ``docling`` dependency is missing.
        """
        try:
            from docling.document_converter import DocumentConverter
        except ImportError:
            raise ImportError("Please install docling: 'pip install docling'")

        return DocumentConverter()

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Pipeline entry point; delegates to :meth:`load_data`."""
        return self.load_data(file_path, extra_info, **kwargs)

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Extract the input file, allowing multi-modal extraction.

        Args:
            file_path: path of the document to convert with Docling.
            extra_info: extra metadata merged into every produced Document.

        Returns:
            Plain-text page documents, followed by table documents, followed
            by figure documents.
        """

        metadata = extra_info or {}

        result = self.converter_.convert(file_path)
        result_dict = result.document.export_to_dict()

        file_path = Path(file_path)
        file_name = file_path.name

        # extract the figures
        figures = []
        gen_caption_count = 0
        for figure_obj in result_dict.get("pictures", []):
            # NOTE(review): without a VLM endpoint, figures are skipped
            # entirely (not merely left uncaptioned), despite the Param help
            # text -- confirm this is intended.
            if not self.vlm_endpoint:
                continue
            if file_path.suffix.lower() not in self.figure_friendly_filetypes:
                continue

            # retrieve extractive captions provided by docling
            caption_refs = [caption["$ref"] for caption in figure_obj["captions"]]
            extractive_captions = []
            for caption_ref in caption_refs:
                # refs look like ".../texts/<idx>"; resolve against the
                # flat "texts" array
                text_id = caption_ref.split("/")[-1]
                try:
                    caption_text = result_dict["texts"][int(text_id)]["text"]
                    extractive_captions.append(caption_text)
                except (ValueError, TypeError, IndexError) as e:
                    print(e)
                    continue

            # read & crop image (default page 1, consistent with tables/texts)
            page_number = figure_obj["prov"][0].get("page_no", 1)

            try:
                page_number_text = str(page_number)
                page_width = result_dict["pages"][page_number_text]["size"]["width"]
                page_height = result_dict["pages"][page_number_text]["size"]["height"]

                bbox_obj = figure_obj["prov"][0]["bbox"]
                bbox: list[float] = [
                    bbox_obj["l"],
                    bbox_obj["t"],
                    bbox_obj["r"],
                    bbox_obj["b"],
                ]
                # Docling reports bottom-left-origin coordinates for PDFs;
                # convert to the normalized top-left system crop_image expects.
                if bbox_obj["coord_origin"] == "BOTTOMLEFT":
                    bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)

                # crop_image takes a 0-based page index
                img = crop_image(file_path, bbox, page_number - 1)
            except KeyError as e:
                print(e, list(result_dict["pages"].keys()))
                continue

            # convert img to base64
            img_bytes = BytesIO()
            img.save(img_bytes, format="PNG")
            img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
            img_base64 = f"data:image/png;base64,{img_base64}"

            # generate the generative caption (only up to max_figure_to_caption)
            if gen_caption_count >= self.max_figure_to_caption:
                gen_caption = ""
            else:
                gen_caption_count += 1
                # FIX: generate_single_figure_caption takes (vlm_endpoint,
                # figure); the arguments were previously passed swapped.
                gen_caption = generate_single_figure_caption(
                    self.vlm_endpoint, img_base64
                )

            # join the extractive and generative captions
            caption = "\n".join(extractive_captions + [gen_caption])

            # store the image into document
            figure_metadata = {
                "image_origin": img_base64,
                "type": "image",
                "page_label": page_number,
                "file_name": file_name,
                "file_path": file_path,
            }
            figure_metadata.update(metadata)

            figures.append(
                Document(
                    text=caption,
                    metadata=figure_metadata,
                )
            )

        # extract the tables
        tables = []
        for table_obj in result_dict.get("tables", []):
            # convert the tables into markdown format
            markdown_table = self._parse_table(table_obj)
            caption_refs = [caption["$ref"] for caption in table_obj["captions"]]

            extractive_captions = []
            for caption_ref in caption_refs:
                text_id = caption_ref.split("/")[-1]
                try:
                    caption_text = result_dict["texts"][int(text_id)]["text"]
                    extractive_captions.append(caption_text)
                except (ValueError, TypeError, IndexError) as e:
                    print(e)
                    continue
            # prepend the extractive captions to the markdown table
            caption = "\n".join(extractive_captions)
            markdown_table = f"{caption}\n{markdown_table}"

            page_number = table_obj["prov"][0].get("page_no", 1)

            table_metadata = {
                "type": "table",
                "page_label": page_number,
                "table_origin": markdown_table,
                "file_name": file_name,
                "file_path": file_path,
            }
            table_metadata.update(metadata)

            tables.append(
                Document(
                    text=markdown_table,
                    metadata=table_metadata,
                )
            )

        # join plain text elements, grouped by page
        texts = []
        page_number_to_text = defaultdict(list)

        for text_obj in result_dict.get("texts", []):
            page_number = text_obj["prov"][0].get("page_no", 1)
            page_number_to_text[page_number].append(text_obj["text"])

        for page_number, txts in page_number_to_text.items():
            texts.append(
                Document(
                    text="\n".join(txts),
                    metadata={
                        "page_label": page_number,
                        "file_name": file_name,
                        "file_path": file_path,
                        **metadata,
                    },
                )
            )

        return texts + tables + figures

    def _convert_bbox_bl_tl(
        self, bbox: list[float], page_width: int, page_height: int
    ) -> list[float]:
        """Convert bbox from bottom-left origin to normalized top-left origin."""
        x0, y0, x1, y1 = bbox
        return [
            x0 / page_width,
            (page_height - y1) / page_height,
            x1 / page_width,
            (page_height - y0) / page_height,
        ]

    def _parse_table(self, table_obj: dict) -> str:
        """Convert a docling table object (cell grid) to a markdown table."""
        table_as_list: List[List[str]] = []
        grid = table_obj["data"]["grid"]
        for row in grid:
            table_as_list.append([])
            for cell in row:
                table_as_list[-1].append(cell["text"])

        return make_markdown_table(table_as_list)
24 changes: 14 additions & 10 deletions libs/kotaemon/kotaemon/loaders/utils/adobe.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def request_adobe_service(file_path: str, output_path: str = "") -> str:
return output_path


def make_markdown_table(table_as_list: List[str]) -> str:
def make_markdown_table(table_as_list: List[List[str]]) -> str:
"""
Convert table from python list representation to markdown format.
The input list consists of rows of tables, the first row is the header.
Expand Down Expand Up @@ -203,17 +203,21 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:


def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:
    """Summarize a single figure using GPT-4V.

    Args:
        vlm_endpoint: endpoint of the vision-language model, forwarded to
            ``generate_gpt4v``.
        figure: base64-encoded image (data URI); a falsy value short-circuits.

    Returns:
        A short caption, or "" when the figure is empty, the model replies
        with an apology ("sorry"), or the request raises.
    """
    output = ""

    if figure:
        try:
            output = generate_gpt4v(
                endpoint=vlm_endpoint,
                prompt="Provide a short 2 sentence summary of this image?",
                images=figure,
            )
            # Treat apologetic refusals from the model as "no caption".
            if "sorry" in output.lower():
                output = ""
        except Exception as e:
            # Best-effort: a captioning failure must not abort ingestion.
            print(f"Error generating caption: {e}")

    return output


Expand Down
4 changes: 4 additions & 0 deletions libs/ktem/ktem/index/file/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
KH_DEFAULT_FILE_EXTRACTORS,
adobe_reader,
azure_reader,
docling_reader,
unstructured,
web_reader,
)
Expand Down Expand Up @@ -673,6 +674,8 @@ def readers(self):
readers[".pdf"] = adobe_reader
elif self.reader_mode == "azure-di":
readers[".pdf"] = azure_reader
elif self.reader_mode == "docling":
readers[".pdf"] = docling_reader

dev_readers, _, _ = dev_settings()
readers.update(dev_readers)
Expand All @@ -692,6 +695,7 @@ def get_user_settings(cls):
"Azure AI Document Intelligence (figure+table extraction)",
"azure-di",
),
("Docling", "docling"),
],
"component": "dropdown",
},
Expand Down
Loading