
Commit 54320d0

feat(loader): implement markdown parsing in MathpixPDFReader (#498)
* ✨ feat(loader): implement markdown parsing in MathpixPDFReader

  Add functionality to properly handle PDF content:
  - Add a parse_markdown_text_to_tables method to separate tables from text
  - Fix the load_data implementation to properly process documents
  - Fix the lazy_load_data method
  - Improve document metadata handling for table and text sections

  The loader now correctly processes PDFs through the Mathpix API and converts the content into proper Document objects.

* fix(loader): remove super() calls blocking the MathpixPDFReader implementation

  Remove the early returns via super() in load_data and lazy_load_data that prevented the actual implementation from running. This fixes the "not implemented" error while keeping the full PDF reader functionality.
1 parent b1e9f98 commit 54320d0
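
A minimal usage sketch (not part of the commit) showing the load_data / lazy_load_data paths this change implements. The import path is inferred from the file location, "sample.pdf" is a hypothetical file, and Mathpix credentials are assumed to be configured through the reader's existing kwargs/environment handling, which this commit does not touch.

# Minimal usage sketch; assumes Mathpix credentials are already configured.
from kotaemon.loaders.mathpix_loader import MathpixPDFReader

reader = MathpixPDFReader(should_clean_pdf=True, max_wait_time_seconds=900)

# Eager path: returns a list of Document objects; tables and text sections
# carry "type", "page_label" and "page_number" metadata.
docs = reader.load_data("sample.pdf")
print(len(docs), docs[0].metadata.get("type"))

# Lazy path: yields Documents one at a time.
for doc in reader.lazy_load_data("sample.pdf"):
    print(doc.metadata.get("page_number"), doc.metadata.get("type"))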

File tree: 1 file changed (+180, −20 lines)


libs/kotaemon/kotaemon/loaders/mathpix_loader.py

Lines changed: 180 additions & 20 deletions
@@ -2,15 +2,15 @@
 import re
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Generator, List, Optional, Union
 
 import requests
 from langchain.utils import get_from_dict_or_env
 from llama_index.core.readers.base import BaseReader
 
 from kotaemon.base import Document
 
-from .utils.table import parse_markdown_text_to_tables, strip_special_chars_markdown
+from .utils.table import strip_special_chars_markdown
 
 
 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
@@ -21,7 +21,7 @@ class MathpixPDFReader(BaseReader):
     def __init__(
         self,
         processed_file_format: str = "md",
-        max_wait_time_seconds: int = 500,
+        max_wait_time_seconds: int = 900,
         should_clean_pdf: bool = True,
         **kwargs: Any,
     ) -> None:
@@ -87,22 +87,38 @@ def wait_for_processing(self, pdf_id: str) -> None:
             response = requests.get(url, headers=self._mathpix_headers)
             response_data = response.json()
             status = response_data.get("status", None)
+            print(
+                f"Processing status: {status},"
+                f"Progress: {response_data.get('percent_done', 0)}%"
+            )
 
             if status == "completed":
                 return
             elif status == "error":
-                raise ValueError("Unable to retrieve PDF from Mathpix")
+                raise ValueError(f"Mathpix processing error: {response_data}")
+            elif status in [
+                "split",
+                "processing",
+            ]:  # Add handling for processing states
+                time.sleep(5)
+                continue
             else:
-                print(response_data)
-                print(url)
+                print(f"Unknown status: {response_data}")
                 time.sleep(5)
-        raise TimeoutError
+
+        raise TimeoutError(
+            f"Processing did not complete within {self.max_wait_time_seconds} seconds"
+        )
 
     def get_processed_pdf(self, pdf_id: str) -> str:
         self.wait_for_processing(pdf_id)
         url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
         response = requests.get(url, headers=self._mathpix_headers)
-        return response.content.decode("utf-8")
+        if response.status_code != 200:
+            raise ValueError(f"Failed to get processed PDF: {response.text}")
+        content = response.content.decode("utf-8")
+        print(f"Retrieved content length: {len(content)}")  # Debug print
+        return content
 
     def clean_pdf(self, contents: str) -> str:
         """Clean the PDF file.
@@ -139,26 +155,79 @@ def clean_pdf(self, contents: str) -> str:
         contents = re.sub(markup_regex, "", contents)
         return contents
 
+    def parse_markdown_text_to_tables(
+        self, content: str
+    ) -> tuple[list[tuple[int, str]], list[tuple[int, str]]]:
+        """Parse markdown text to get tables and texts separately.
+
+        Returns:
+            Tuple of (tables, texts) where each is a list of (page_num, content) tuples
+        """
+        print("Starting markdown parsing...")
+        print(f"Content length: {len(content)}")
+
+        # Split by page markers if present
+        pages = re.split(r"(?m)^# Page \d+\n", content)
+
+        tables: list[tuple[int, str]] = []
+        texts: list[tuple[int, str]] = []
+
+        for page_num, page_content in enumerate(pages, 1):
+            if not page_content.strip():
+                continue
+
+            # Extract tables from the page
+            table_matches = re.findall(r"(\|[^\n]+\|(?:\n\|[^\n]+\|)*)", page_content)
+            if table_matches:
+                for table in table_matches:
+                    tables.append(
+                        (page_num, table.strip())
+                    )  # Store as tuple with page number
+                # Remove tables from page content
+                page_content = re.sub(
+                    r"(\|[^\n]+\|(?:\n\|[^\n]+\|)*)", "", page_content
+                )
+
+            # Split remaining content into meaningful chunks
+            chunks = re.split(r"\n\s*\n", page_content)
+            for chunk in chunks:
+                if chunk.strip():
+                    texts.append(
+                        (page_num, chunk.strip())
+                    )  # Store as tuple with page number
+
+        print(f"Found {len(tables)} tables and {len(texts)} text sections")
+        return tables, texts
+
     def load_data(
-        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+        self,
+        file: Union[str, List[str], Path],
+        extra_info: Optional[Dict] = None,
+        **load_kwargs: Any,
     ) -> List[Document]:
-        if "response_content" in kwargs:
-            # overriding response content if specified
-            content = kwargs["response_content"]
+        """Load data from file path."""
+        file_path = Path(file) if isinstance(file, str) else file
+
+        if "response_content" in load_kwargs:
+            content = load_kwargs["response_content"]
         else:
-            # call original API
             pdf_id = self.send_pdf(file_path)
             content = self.get_processed_pdf(pdf_id)
 
         if self.should_clean_pdf:
             content = self.clean_pdf(content)
-        tables, texts = parse_markdown_text_to_tables(content)
+
+        tables, texts = self.parse_markdown_text_to_tables(content)
         documents = []
-        for table in tables:
-            text = strip_special_chars_markdown(table)
+
+        # Handle tables
+        for page_num, table_content in tables:
+            text = strip_special_chars_markdown(table_content)
             metadata = {
-                "table_origin": table,
+                "table_origin": table_content,
                 "type": "table",
+                "page_label": page_num,
+                "page_number": page_num,
             }
             if extra_info:
                 metadata.update(extra_info)
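
A self-contained sketch (not part of the commit) of how the new parse_markdown_text_to_tables logic splits Mathpix markdown: pages are separated on "# Page N" headings, pipe-delimited blocks are collected as tables, and the remaining blank-line-separated chunks become text sections. The sample string is made up; the regexes are the ones from the method above.

# Standalone illustration of the table/text splitting used by the new method.
import re

sample = (
    "Intro text before any page marker.\n\n"
    "# Page 2\n"
    "| a | b |\n"
    "| 1 | 2 |\n\n"
    "Prose on the second page.\n"
)

tables, texts = [], []
for page_num, page in enumerate(re.split(r"(?m)^# Page \d+\n", sample), 1):
    if not page.strip():
        continue
    # Collect pipe-delimited blocks as tables, then strip them out.
    for table in re.findall(r"(\|[^\n]+\|(?:\n\|[^\n]+\|)*)", page):
        tables.append((page_num, table.strip()))
    page = re.sub(r"(\|[^\n]+\|(?:\n\|[^\n]+\|)*)", "", page)
    # Remaining blank-line-separated chunks become text sections.
    for chunk in re.split(r"\n\s*\n", page):
        if chunk.strip():
            texts.append((page_num, chunk.strip()))

print(tables)  # [(2, '| a | b |\n| 1 | 2 |')]
print(texts)   # [(1, 'Intro text before any page marker.'), (2, 'Prose on the second page.')]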
@@ -171,8 +240,99 @@ def load_data(
                 )
             )
 
-        for text in texts:
-            metadata = {"source": file_path.name, "type": "text"}
-            documents.append(Document(text=text, metadata=metadata))
+        # Handle text sections
+        for page_num, text_content in texts:
+            if not text_content.strip():
+                continue
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": page_num,
+                "page_number": page_num,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            documents.append(Document(text=text_content, metadata=metadata))
+
+        # Fallback if no content was parsed
+        if not documents and content.strip():
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": 1,
+                "page_number": 1,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            documents.append(Document(text=content.strip(), metadata=metadata))
 
         return documents
+
+    def lazy_load_data(
+        self,
+        file: Union[str, List[str], Path],
+        extra_info: Optional[Dict] = None,
+        **load_kwargs: Any,
+    ) -> Generator[Document, None, None]:
+        """Lazy load data from file path."""
+        file_path = Path(file) if isinstance(file, str) else file
+
+        if "response_content" in load_kwargs:
+            content = load_kwargs["response_content"]
+        else:
+            pdf_id = self.send_pdf(file_path)
+            print(f"PDF ID: {pdf_id}")
+            content = self.get_processed_pdf(pdf_id)
+
+        if self.should_clean_pdf:
+            content = self.clean_pdf(content)
+
+        tables, texts = self.parse_markdown_text_to_tables(content)
+
+        # Handle tables
+        for page_num, table_content in tables:  # Changed variable name for clarity
+            text = strip_special_chars_markdown(table_content)  # Pass just the content
+            metadata = {
+                "table_origin": table_content,  # Use table_content here too
+                "type": "table",
+                "page_label": page_num,
+                "page_number": page_num,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            yield Document(
+                text=text,
+                metadata=metadata,
+                metadata_template="",
+                metadata_seperator="",
+            )
+
+        # Handle text sections
+        for page_num, text_content in texts:  # Changed variable name for clarity
+            if not text_content.strip():
+                continue
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": page_num,
+                "page_number": page_num,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            yield Document(
+                text=text_content, metadata=metadata
+            )  # Use text_content directly
+
+        # Fallback if no content was parsed
+        if not (tables or texts) and content.strip():
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": 1,
+                "page_number": 1,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            yield Document(text=content.strip(), metadata=metadata)
+
+        print(f"Completed processing PDF: {file_path}")
