Skip to content

Commit ece0398

Browse files
authored
Merge pull request #2296 from danielaskdd/pdf-decryption
Feat: Add PDF Decryption Support for Password-Protected Files
2 parents 728721b + 61b57cb commit ece0398

File tree

3 files changed

+67
-0
lines changed

3 files changed

+67
-0
lines changed

env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
119119
### Document processing output language: English, Chinese, French, German ...
120120
SUMMARY_LANGUAGE=English
121121

122+
### Password used to decrypt password-protected (encrypted) PDF files
123+
# PDF_DECRYPT_PASSWORD=your_pdf_password_here
124+
122125
### Entity types that the LLM will attempt to recognize
123126
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
124127

lightrag/api/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,9 @@ def parse_args() -> argparse.Namespace:
342342
# Select Document loading tool (DOCLING, DEFAULT)
343343
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
344344

345+
# Password used to decrypt encrypted PDFs, read from PDF_DECRYPT_PASSWORD
346+
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
347+
345348
# Add environment variables that were previously read directly
346349
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
347350
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)

lightrag/api/routers/document_routes.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,6 +1090,67 @@ async def pipeline_enqueue_file(
10901090

10911091
pdf_file = BytesIO(file)
10921092
reader = PdfReader(pdf_file)
1093+
1094+
# Check if PDF is encrypted
1095+
if reader.is_encrypted:
1096+
pdf_password = global_args.pdf_decrypt_password
1097+
if not pdf_password:
1098+
# PDF is encrypted but no password provided
1099+
error_files = [
1100+
{
1101+
"file_path": str(file_path.name),
1102+
"error_description": "[File Extraction]PDF is encrypted but no password provided",
1103+
"original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
1104+
"file_size": file_size,
1105+
}
1106+
]
1107+
await rag.apipeline_enqueue_error_documents(
1108+
error_files, track_id
1109+
)
1110+
logger.error(
1111+
f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
1112+
)
1113+
return False, track_id
1114+
1115+
# Try to decrypt with password
1116+
try:
1117+
decrypt_result = reader.decrypt(pdf_password)
1118+
if decrypt_result == 0:
1119+
# Password is incorrect
1120+
error_files = [
1121+
{
1122+
"file_path": str(file_path.name),
1123+
"error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
1124+
"original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
1125+
"file_size": file_size,
1126+
}
1127+
]
1128+
await rag.apipeline_enqueue_error_documents(
1129+
error_files, track_id
1130+
)
1131+
logger.error(
1132+
f"[File Extraction]Incorrect PDF password: {file_path.name}"
1133+
)
1134+
return False, track_id
1135+
except Exception as decrypt_error:
1136+
# Decryption process error
1137+
error_files = [
1138+
{
1139+
"file_path": str(file_path.name),
1140+
"error_description": "[File Extraction]PDF decryption failed",
1141+
"original_error": f"Error during PDF decryption: {str(decrypt_error)}",
1142+
"file_size": file_size,
1143+
}
1144+
]
1145+
await rag.apipeline_enqueue_error_documents(
1146+
error_files, track_id
1147+
)
1148+
logger.error(
1149+
f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
1150+
)
1151+
return False, track_id
1152+
1153+
# Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
10931154
for page in reader.pages:
10941155
content += page.extract_text() + "\n"
10951156
except Exception as e:

0 commit comments

Comments
 (0)