Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
### Document processing output language: English, Chinese, French, German ...
SUMMARY_LANGUAGE=English

### Password used to decrypt encrypted (password-protected) PDF files during file extraction; the same password is tried for every encrypted PDF
# PDF_DECRYPT_PASSWORD=your_pdf_password_here

### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'

Expand Down
3 changes: 3 additions & 0 deletions lightrag/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,9 @@ def parse_args() -> argparse.Namespace:
# Select Document loading tool (DOCLING, DEFAULT)
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")

# PDF decryption password
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)

# Add environment variables that were previously read directly
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)
Expand Down
61 changes: 61 additions & 0 deletions lightrag/api/routers/document_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,67 @@ async def pipeline_enqueue_file(

pdf_file = BytesIO(file)
reader = PdfReader(pdf_file)

# Check if PDF is encrypted
if reader.is_encrypted:
pdf_password = global_args.pdf_decrypt_password
if not pdf_password:
# PDF is encrypted but no password provided
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]PDF is encrypted but no password provided",
"original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
)
return False, track_id

# Try to decrypt with password
try:
decrypt_result = reader.decrypt(pdf_password)
if decrypt_result == 0:
# Password is incorrect
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
"original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]Incorrect PDF password: {file_path.name}"
)
return False, track_id
except Exception as decrypt_error:
# Decryption process error
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]PDF decryption failed",
"original_error": f"Error during PDF decryption: {str(decrypt_error)}",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
)
return False, track_id

# Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
for page in reader.pages:
content += page.extract_text() + "\n"
except Exception as e:
Expand Down