@@ -1090,6 +1090,67 @@ async def pipeline_enqueue_file(
10901090
10911091 pdf_file = BytesIO (file )
10921092 reader = PdfReader (pdf_file )
1093+
1094+ # Check if PDF is encrypted
1095+ if reader .is_encrypted :
1096+ pdf_password = global_args .pdf_decrypt_password
1097+ if not pdf_password :
1098+ # PDF is encrypted but no password provided
1099+ error_files = [
1100+ {
1101+ "file_path" : str (file_path .name ),
1102+ "error_description" : "[File Extraction]PDF is encrypted but no password provided" ,
1103+ "original_error" : "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file" ,
1104+ "file_size" : file_size ,
1105+ }
1106+ ]
1107+ await rag .apipeline_enqueue_error_documents (
1108+ error_files , track_id
1109+ )
1110+ logger .error (
1111+ f"[File Extraction]PDF is encrypted but no password provided: { file_path .name } "
1112+ )
1113+ return False , track_id
1114+
1115+ # Try to decrypt with password
1116+ try :
1117+ decrypt_result = reader .decrypt (pdf_password )
1118+ if decrypt_result == 0 :
1119+ # Password is incorrect
1120+ error_files = [
1121+ {
1122+ "file_path" : str (file_path .name ),
1123+ "error_description" : "[File Extraction]Failed to decrypt PDF - incorrect password" ,
1124+ "original_error" : "The provided PDF_DECRYPT_PASSWORD is incorrect for this file" ,
1125+ "file_size" : file_size ,
1126+ }
1127+ ]
1128+ await rag .apipeline_enqueue_error_documents (
1129+ error_files , track_id
1130+ )
1131+ logger .error (
1132+ f"[File Extraction]Incorrect PDF password: { file_path .name } "
1133+ )
1134+ return False , track_id
1135+ except Exception as decrypt_error :
1136+ # decrypt() raised an exception; this handler also catches any error raised while reporting the wrong-password case above
1137+ error_files = [
1138+ {
1139+ "file_path" : str (file_path .name ),
1140+ "error_description" : "[File Extraction]PDF decryption failed" ,
1141+ "original_error" : f"Error during PDF decryption: { str (decrypt_error )} " ,
1142+ "file_size" : file_size ,
1143+ }
1144+ ]
1145+ await rag .apipeline_enqueue_error_documents (
1146+ error_files , track_id
1147+ )
1148+ logger .error (
1149+ f"[File Extraction]PDF decryption error for { file_path .name } : { str (decrypt_error )} "
1150+ )
1151+ return False , track_id
1152+
1153+ # Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
10931154 for page in reader .pages :
10941155 content += page .extract_text () + "\n "
10951156 except Exception as e :
0 commit comments