
Commit 8ac26c9

cleo-ia and claude committed
fix: enhance PostgreSQL reconnection tolerance for HA deployments
Cherry-picked from HKUDS/LightRAG PR HKUDS#2562.

Backend changes:
- connection_retry_attempts: max 10 → 100, default 3 → 10
- connection_retry_backoff: max 5s → 300s, default 0.5s → 3s
- connection_retry_backoff_max: max 60s → 600s, default 5s → 30s

Frontend changes:
- Context-aware timeouts: default 30s, uploads 120s, scans 90s

This enables seamless recovery during ~30s PostgreSQL HA switchovers without manual intervention.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 16e65b4 commit 8ac26c9

File tree

4 files changed (+61 −38 lines):
- env.example
- lightrag/kg/postgres_impl.py
- lightrag_webui/src/features/DocumentManager.tsx
- tests/test_postgres_retry_integration.py

env.example

Lines changed: 15 additions & 6 deletions
@@ -390,13 +390,22 @@ POSTGRES_VCHORDRQ_PROBES=
 POSTGRES_VCHORDRQ_EPSILON=1.9
 
 ### PostgreSQL Connection Retry Configuration (Network Robustness)
-### Number of retry attempts (1-10, default: 3)
-### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
-### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
+### NEW DEFAULTS (v1.4.10+): Optimized for HA deployments with ~30s switchover time
+### These defaults provide out-of-the-box support for PostgreSQL High Availability setups
+###
+### Number of retry attempts (1-100, default: 10)
+### - Default 10 attempts allows ~225s total retry time (sufficient for most HA scenarios)
+### - For extreme cases: increase up to 20-50
+### Initial retry backoff in seconds (0.1-300.0, default: 3.0)
+### - Default 3.0s provides reasonable initial delay for switchover detection
+### - For faster recovery: decrease to 1.0-2.0
+### Maximum retry backoff in seconds (must be >= backoff, max: 600.0, default: 30.0)
+### - Default 30.0s matches typical switchover completion time
+### - For longer switchovers: increase to 60-90
 ### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
-# POSTGRES_CONNECTION_RETRIES=3
-# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
-# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
+# POSTGRES_CONNECTION_RETRIES=10
+# POSTGRES_CONNECTION_RETRY_BACKOFF=3.0
+# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=30.0
 # POSTGRES_POOL_CLOSE_TIMEOUT=5.0
 
 ### PostgreSQL SSL Configuration (Optional)
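The "~225s total retry time" figure quoted in the new comments follows from a doubling-with-cap backoff schedule. A minimal sketch, assuming the retry loop doubles the delay each attempt, caps it at the configured maximum, and waits once per attempt (the actual loop lives in postgres_impl.py):

```python
# Sketch only: reproduces the ~225s figure under the doubling-with-cap assumption.
def total_retry_time(attempts: int = 10, backoff: float = 3.0, backoff_max: float = 30.0) -> float:
    # Delay per attempt: 3, 6, 12, 24, then capped at 30 for the remaining attempts.
    return sum(min(backoff * 2**i, backoff_max) for i in range(attempts))

print(total_retry_time())  # 3 + 6 + 12 + 24 + 30*6 = 225.0 seconds
```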

lightrag/kg/postgres_impl.py

Lines changed: 6 additions & 6 deletions
@@ -1619,34 +1619,34 @@ def get_config() -> dict[str, Any]:
             ),
             # Connection retry configuration
             "connection_retry_attempts": min(
-                10,
+                100,  # Increased from 10 to 100 for long-running operations
                 int(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRIES",
-                        config.get("postgres", "connection_retries", fallback=3),
+                        config.get("postgres", "connection_retries", fallback=10),
                     )
                 ),
             ),
             "connection_retry_backoff": min(
-                5.0,
+                300.0,  # Increased from 5.0 to 300.0 (5 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF",
                         config.get(
-                            "postgres", "connection_retry_backoff", fallback=0.5
+                            "postgres", "connection_retry_backoff", fallback=3.0
                         ),
                     )
                 ),
             ),
             "connection_retry_backoff_max": min(
-                60.0,
+                600.0,  # Increased from 60.0 to 600.0 (10 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF_MAX",
                         config.get(
                             "postgres",
                             "connection_retry_backoff_max",
-                            fallback=5.0,
+                            fallback=30.0,
                         ),
                     )
                 ),
             ),
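Each setting above follows the same precedence-and-clamp pattern: an environment variable overrides the config-file value, which overrides the built-in default, and the result is clamped to a hard ceiling. A simplified sketch; `clamped` is a hypothetical helper, not part of postgres_impl.py:

```python
import os

# Hypothetical helper illustrating the pattern: env var > fallback > default,
# clamped to a ceiling so a typo like POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=9999
# cannot stall reconnection indefinitely.
def clamped(env_var: str, fallback: float, ceiling: float) -> float:
    return min(ceiling, float(os.environ.get(env_var, fallback)))

backoff_max = clamped("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", 30.0, 600.0)
```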

lightrag_webui/src/features/DocumentManager.tsx

Lines changed: 12 additions & 8 deletions
@@ -584,7 +584,7 @@ export default function DocumentManager() {
   // Utility function to create timeout wrapper for API calls
   const withTimeout = useCallback((
     promise: Promise<any>,
-    timeoutMs: number = 30000,
+    timeoutMs: number = 30000, // Default 30s timeout for normal operations
     errorMsg: string = 'Request timeout'
   ): Promise<any> => {
     const timeoutPromise = new Promise((_, reject) => {

@@ -676,7 +676,8 @@ export default function DocumentManager() {
   // Intelligent refresh function: handles all boundary cases
   const handleIntelligentRefresh = useCallback(async (
     targetPage?: number, // Optional target page, defaults to current page
-    resetToFirst?: boolean // Whether to force reset to first page
+    resetToFirst?: boolean, // Whether to force reset to first page
+    customTimeout?: number // Optional custom timeout in milliseconds (uses withTimeout default if not provided)
   ) => {
     try {
       if (!isMountedRef.current) return;

@@ -694,10 +695,10 @@ export default function DocumentManager() {
         sort_direction: sortDirection
       };
 
-      // Use timeout wrapper for the API call
+      // Use timeout wrapper for the API call (uses customTimeout if provided, otherwise withTimeout default)
       const response = await withTimeout(
         getDocumentsPaginated(request),
-        30000, // 30 second timeout
+        customTimeout, // Pass undefined to use default 30s, or explicit timeout for special cases
         'Document fetch timeout'
       );
 
@@ -717,7 +718,7 @@ export default function DocumentManager() {
 
       const lastPageResponse = await withTimeout(
         getDocumentsPaginated(lastPageRequest),
-        30000,
+        customTimeout, // Use same timeout for consistency
         'Document fetch timeout'
       );
 
@@ -847,7 +848,10 @@ export default function DocumentManager() {
       // Reset health check timer with 1 second delay to avoid race condition
       useBackendState.getState().resetHealthCheckTimerDelayed(1000);
 
-      // Start fast refresh with 2-second interval immediately after scan
+      // Perform immediate refresh with 90s timeout after scan (tolerates PostgreSQL switchover)
+      await handleIntelligentRefresh(undefined, false, 90000);
+
+      // Start fast refresh with 2-second interval after initial refresh
       startPollingInterval(2000);
 
       // Set recovery timer to restore normal polling interval after 15 seconds

@@ -865,7 +869,7 @@ export default function DocumentManager() {
         toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) }));
       }
     }
-  }, [t, startPollingInterval, currentTab, health, statusCounts])
+  }, [t, startPollingInterval, currentTab, health, statusCounts, handleIntelligentRefresh])

@@ -1184,7 +1188,7 @@ export default function DocumentManager() {
         ) : !isSelectionMode ? (
           <ClearDocumentsDialog onDocumentsCleared={handleDocumentsCleared} />
         ) : null}
-        <UploadDocumentsDialog onDocumentsUploaded={fetchDocuments} />
+        <UploadDocumentsDialog onDocumentsUploaded={() => handleIntelligentRefresh(undefined, false, 120000)} />
        <PipelineStatusDialog
          open={showPipelineStatus}
          onOpenChange={setShowPipelineStatus}
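For illustration only, the same race-against-a-deadline idea expressed in Python's asyncio rather than the TSX withTimeout wrapper. The context-aware values (30s default, 90s post-scan refresh, 120s uploads) come from the diff above; `with_timeout` and `fetch_documents` are stand-in names, not project code:

```python
import asyncio

async def with_timeout(coro, timeout_ms: float = 30_000, error_msg: str = "Request timeout"):
    # Race the operation against a deadline, mirroring the frontend wrapper's behavior.
    try:
        return await asyncio.wait_for(coro, timeout=timeout_ms / 1000)
    except asyncio.TimeoutError:
        raise TimeoutError(error_msg) from None

# Usage, analogous to the post-scan refresh:
#   await with_timeout(fetch_documents(), 90_000, "Document fetch timeout")
```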

tests/test_postgres_retry_integration.py

Lines changed: 28 additions & 18 deletions
@@ -31,7 +31,13 @@ class TestPostgresRetryIntegration:
 
     @pytest.fixture
     def db_config(self):
-        """Load database configuration from environment variables."""
+        """Load database configuration from environment variables.
+
+        Uses new HA-optimized defaults that match postgres_impl.py ClientManager.get_config():
+        - 10 retry attempts (up from 3)
+        - 3.0s initial backoff (up from 0.5s)
+        - 30.0s max backoff (up from 5.0s)
+        """
         return {
             "host": os.getenv("POSTGRES_HOST", "localhost"),
             "port": int(os.getenv("POSTGRES_PORT", "5432")),

@@ -40,31 +46,31 @@ def db_config(self):
             "database": os.getenv("POSTGRES_DATABASE", "postgres"),
             "workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"),
             "max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")),
-            # Connection retry configuration
+            # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config()
+            # NEW DEFAULTS optimized for HA deployments
             "connection_retry_attempts": min(
-                10, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
+                100,
+                int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10")),  # 3 → 10
             ),
             "connection_retry_backoff": min(
-                5.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
+                300.0,
+                float(
+                    os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0")
+                ),  # 0.5 → 3.0
             ),
             "connection_retry_backoff_max": min(
-                60.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
+                600.0,
+                float(
+                    os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0")
+                ),  # 5.0 → 30.0
             ),
             "pool_close_timeout": min(
                 30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))
             ),
         }
 
-    @pytest.fixture
-    def test_env(self, monkeypatch):
-        """Set up test environment variables for retry configuration."""
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRIES", "3")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "2.0")
-        monkeypatch.setenv("POSTGRES_POOL_CLOSE_TIMEOUT", "3.0")
-
     @pytest.mark.asyncio
-    async def test_real_connection_success(self, db_config, test_env):
+    async def test_real_connection_success(self, db_config):
         """
         Test successful connection to real PostgreSQL database.

@@ -100,11 +106,12 @@ async def test_real_connection_success(self, db_config):
         await db.pool.close()
 
     @pytest.mark.asyncio
-    async def test_simulated_transient_error_with_real_db(self, db_config, test_env):
+    async def test_simulated_transient_error_with_real_db(self, db_config):
         """
         Test retry mechanism with simulated transient errors on real database.
 
         Simulates connection failures on first 2 attempts, then succeeds.
+        Uses new HA defaults (10 retries, 3s backoff).
         """
         print("\n" + "=" * 80)
         print("INTEGRATION TEST 2: Simulated Transient Errors")

@@ -155,12 +162,13 @@ async def mock_create_pool_with_failures(*args, **kwargs):
         await db.pool.close()
 
     @pytest.mark.asyncio
-    async def test_query_retry_with_real_db(self, db_config, test_env):
+    async def test_query_retry_with_real_db(self, db_config):
         """
         Test query-level retry with simulated connection issues.
 
         Tests that queries retry on transient failures by simulating
         a temporary database unavailability.
+        Uses new HA defaults (10 retries, 3s backoff).
         """
         print("\n" + "=" * 80)
         print("INTEGRATION TEST 3: Query-Level Retry")

@@ -193,11 +201,12 @@ async def test_query_retry_with_real_db(self, db_config):
         await db.pool.close()
 
     @pytest.mark.asyncio
-    async def test_concurrent_queries_with_real_db(self, db_config, test_env):
+    async def test_concurrent_queries_with_real_db(self, db_config):
         """
         Test concurrent queries to validate thread safety and connection pooling.
 
         Runs multiple concurrent queries to ensure no deadlocks or race conditions.
+        Uses new HA defaults (10 retries, 3s backoff).
         """
         print("\n" + "=" * 80)
         print("INTEGRATION TEST 4: Concurrent Queries")

@@ -243,9 +252,10 @@ async def run_query(query_id):
         await db.pool.close()
 
     @pytest.mark.asyncio
-    async def test_pool_close_timeout_real(self, db_config, test_env):
+    async def test_pool_close_timeout_real(self, db_config):
         """
         Test pool close timeout protection with real database.
+        Uses new HA defaults (10 retries, 3s backoff).
         """
         print("\n" + "=" * 80)
         print("INTEGRATION TEST 5: Pool Close Timeout")
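What these tests exercise is, in essence, a capped-exponential retry loop around pool creation. A minimal sketch under the new defaults; `connect_with_retries` and the injected `create_pool` callable are hypothetical stand-ins, not the actual postgres_impl.py implementation:

```python
import asyncio

async def connect_with_retries(create_pool, attempts: int = 10,
                               backoff: float = 3.0, backoff_max: float = 30.0):
    # Retry transient connection failures with doubling delays, capped at backoff_max.
    delay = backoff
    for attempt in range(1, attempts + 1):
        try:
            return await create_pool()
        except (OSError, ConnectionError):
            if attempt == attempts:
                raise  # exhausted all attempts; surface the last error
            await asyncio.sleep(delay)
            delay = min(delay * 2, backoff_max)
```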
