 import re
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Generator, List, Optional, Union

 import requests
 from langchain.utils import get_from_dict_or_env
 from llama_index.core.readers.base import BaseReader

 from kotaemon.base import Document

-from .utils.table import parse_markdown_text_to_tables, strip_special_chars_markdown
+from .utils.table import strip_special_chars_markdown


 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
@@ -21,7 +21,7 @@ class MathpixPDFReader(BaseReader):
     def __init__(
         self,
         processed_file_format: str = "md",
-        max_wait_time_seconds: int = 500,
+        max_wait_time_seconds: int = 900,
         should_clean_pdf: bool = True,
         **kwargs: Any,
     ) -> None:
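For context, a minimal usage sketch of the reader with the raised timeout. The import path and the `MATHPIX_API_KEY` environment variable are assumptions (based on how `get_from_dict_or_env` usually resolves credentials), not confirmed by this diff:

```python
# Minimal usage sketch -- import path and env-var name are assumptions.
import os

os.environ.setdefault("MATHPIX_API_KEY", "sk-...")  # hypothetical key

from kotaemon.loaders import MathpixPDFReader  # assumed import path

reader = MathpixPDFReader(
    processed_file_format="md",
    max_wait_time_seconds=900,  # new default; large PDFs can need the full window
    should_clean_pdf=True,
)
```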
@@ -87,22 +87,38 @@ def wait_for_processing(self, pdf_id: str) -> None:
             response = requests.get(url, headers=self._mathpix_headers)
             response_data = response.json()
             status = response_data.get("status", None)
+            print(
+                f"Processing status: {status}, "
+                f"Progress: {response_data.get('percent_done', 0)}%"
+            )

             if status == "completed":
                 return
             elif status == "error":
-                raise ValueError("Unable to retrieve PDF from Mathpix")
+                raise ValueError(f"Mathpix processing error: {response_data}")
+            elif status in [
+                "split",
+                "processing",
+            ]:  # intermediate states: keep polling
+                time.sleep(5)
+                continue
             else:
-                print(response_data)
-                print(url)
+                print(f"Unknown status: {response_data}")
                 time.sleep(5)
-        raise TimeoutError
+
+        raise TimeoutError(
+            f"Processing did not complete within {self.max_wait_time_seconds} seconds"
+        )

     def get_processed_pdf(self, pdf_id: str) -> str:
         self.wait_for_processing(pdf_id)
         url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
         response = requests.get(url, headers=self._mathpix_headers)
-        return response.content.decode("utf-8")
+        if response.status_code != 200:
+            raise ValueError(f"Failed to get processed PDF: {response.text}")
+        content = response.content.decode("utf-8")
+        print(f"Retrieved content length: {len(content)}")  # debug print
+        return content

     def clean_pdf(self, contents: str) -> str:
         """Clean the PDF file.
@@ -139,26 +155,79 @@ def clean_pdf(self, contents: str) -> str:
         contents = re.sub(markup_regex, "", contents)
         return contents

+    def parse_markdown_text_to_tables(
+        self, content: str
+    ) -> tuple[list[tuple[int, str]], list[tuple[int, str]]]:
+        """Parse markdown text to get tables and texts separately.
+
+        Returns:
+            Tuple of (tables, texts) where each is a list of
+            (page_num, content) tuples.
+        """
+        print("Starting markdown parsing...")
+        print(f"Content length: {len(content)}")
+
+        # Split by page markers if present
+        pages = re.split(r"(?m)^# Page \d+\n", content)
+
+        tables: list[tuple[int, str]] = []
+        texts: list[tuple[int, str]] = []
+
+        for page_num, page_content in enumerate(pages, 1):
+            if not page_content.strip():
+                continue
+
+            # Extract pipe-delimited tables from the page
+            table_matches = re.findall(r"(\|[^\n]+\|(?:\n\|[^\n]+\|)*)", page_content)
+            if table_matches:
+                for table in table_matches:
+                    # Store as a (page_num, content) tuple
+                    tables.append((page_num, table.strip()))
+                # Remove tables from the page content
+                page_content = re.sub(
+                    r"(\|[^\n]+\|(?:\n\|[^\n]+\|)*)", "", page_content
+                )
+
+            # Split the remaining content into meaningful chunks
+            chunks = re.split(r"\n\s*\n", page_content)
+            for chunk in chunks:
+                if chunk.strip():
+                    # Store as a (page_num, content) tuple
+                    texts.append((page_num, chunk.strip()))
+
+        print(f"Found {len(tables)} tables and {len(texts)} text sections")
+        return tables, texts
+
     def load_data(
-        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+        self,
+        file: Union[str, List[str], Path],
+        extra_info: Optional[Dict] = None,
+        **load_kwargs: Any,
     ) -> List[Document]:
-        if "response_content" in kwargs:
-            # overriding response content if specified
-            content = kwargs["response_content"]
+        """Load data from file path."""
+        file_path = Path(file) if isinstance(file, str) else file
+
+        if "response_content" in load_kwargs:
+            content = load_kwargs["response_content"]
         else:
-            # call original API
             pdf_id = self.send_pdf(file_path)
             content = self.get_processed_pdf(pdf_id)

         if self.should_clean_pdf:
             content = self.clean_pdf(content)
-        tables, texts = parse_markdown_text_to_tables(content)
+
+        tables, texts = self.parse_markdown_text_to_tables(content)
         documents = []
-        for table in tables:
-            text = strip_special_chars_markdown(table)
+
+        # Handle tables
+        for page_num, table_content in tables:
+            text = strip_special_chars_markdown(table_content)
             metadata = {
-                "table_origin": table,
+                "table_origin": table_content,
                 "type": "table",
+                "page_label": page_num,
+                "page_number": page_num,
             }
             if extra_info:
                 metadata.update(extra_info)
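To make the parsing behavior concrete, here is what the new method returns on a small markdown snippet. This is a sketch: `reader` is an instance as in the earlier example, and the `# Page N` markers are only split on if Mathpix actually emits them:

```python
sample = (
    "Intro paragraph.\n\n"
    "| a | b |\n"
    "| 1 | 2 |\n\n"
    "# Page 2\n"
    "Closing text.\n"
)
tables, texts = reader.parse_markdown_text_to_tables(sample)
# tables -> [(1, '| a | b |\n| 1 | 2 |')]
# texts  -> [(1, 'Intro paragraph.'), (2, 'Closing text.')]
```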
@@ -171,8 +240,99 @@ def load_data(
                 )
             )

-        for text in texts:
-            metadata = {"source": file_path.name, "type": "text"}
-            documents.append(Document(text=text, metadata=metadata))
+        # Handle text sections
+        for page_num, text_content in texts:
+            if not text_content.strip():
+                continue
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": page_num,
+                "page_number": page_num,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            documents.append(Document(text=text_content, metadata=metadata))
+
+        # Fallback if no content was parsed
+        if not documents and content.strip():
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": 1,
+                "page_number": 1,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            documents.append(Document(text=content.strip(), metadata=metadata))

         return documents
+
+    def lazy_load_data(
+        self,
+        file: Union[str, List[str], Path],
+        extra_info: Optional[Dict] = None,
+        **load_kwargs: Any,
+    ) -> Generator[Document, None, None]:
+        """Lazily load data from file path, yielding Documents as they are built."""
+        file_path = Path(file) if isinstance(file, str) else file
+
+        if "response_content" in load_kwargs:
+            content = load_kwargs["response_content"]
+        else:
+            pdf_id = self.send_pdf(file_path)
+            print(f"PDF ID: {pdf_id}")
+            content = self.get_processed_pdf(pdf_id)
+
+        if self.should_clean_pdf:
+            content = self.clean_pdf(content)
+
+        tables, texts = self.parse_markdown_text_to_tables(content)
+
+        # Handle tables
+        for page_num, table_content in tables:
+            text = strip_special_chars_markdown(table_content)
+            metadata = {
+                "table_origin": table_content,
+                "type": "table",
+                "page_label": page_num,
+                "page_number": page_num,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            yield Document(
+                text=text,
+                metadata=metadata,
+                metadata_template="",
+                metadata_seperator="",
+            )
+
+        # Handle text sections
+        for page_num, text_content in texts:
+            if not text_content.strip():
+                continue
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": page_num,
+                "page_number": page_num,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            yield Document(text=text_content, metadata=metadata)
+
+        # Fallback if no content was parsed
+        if not (tables or texts) and content.strip():
+            metadata = {
+                "source": str(file_path),
+                "type": "text",
+                "page_label": 1,
+                "page_number": 1,
+            }
+            if extra_info:
+                metadata.update(extra_info)
+            yield Document(text=content.strip(), metadata=metadata)
+
+        print(f"Completed processing PDF: {file_path}")
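Taken together, an end-to-end sketch (the file name is illustrative; `lazy_load_data` streams the same Documents that `load_data` collects into a list):

```python
docs = reader.load_data("paper.pdf")
for doc in docs:
    print(doc.metadata["type"], doc.metadata["page_number"], len(doc.text))

# Or stream Documents one at a time:
for doc in reader.lazy_load_data("paper.pdf"):
    print(doc.metadata["type"], doc.metadata["page_label"])
```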