env.example (21 changes: 15 additions & 6 deletions)
@@ -403,13 +403,22 @@ POSTGRES_VCHORDRQ_PROBES=
POSTGRES_VCHORDRQ_EPSILON=1.9

### PostgreSQL Connection Retry Configuration (Network Robustness)
### Number of retry attempts (1-10, default: 3)
### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
### NEW DEFAULTS (v1.4.10+): Optimized for HA deployments with ~30s switchover time
### These defaults provide out-of-the-box support for PostgreSQL High Availability setups
###
### Number of retry attempts (1-100, default: 10)
### - Default 10 attempts allows ~225s total retry time (sufficient for most HA scenarios)
### - For extreme cases: increase up to 20-50
### Initial retry backoff in seconds (0.1-300.0, default: 3.0)
### - Default 3.0s provides reasonable initial delay for switchover detection
### - For faster recovery: decrease to 1.0-2.0
### Maximum retry backoff in seconds (must be >= backoff, max: 600.0, default: 30.0)
### - Default 30.0s matches typical switchover completion time
### - For longer switchovers: increase to 60-90
### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
# POSTGRES_CONNECTION_RETRIES=3
# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
# POSTGRES_CONNECTION_RETRIES=10
# POSTGRES_CONNECTION_RETRY_BACKOFF=3.0
# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=30.0
# POSTGRES_POOL_CLOSE_TIMEOUT=5.0

### PostgreSQL SSL Configuration (Optional)
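A quick check of the "~225s total retry time" figure quoted in the new env.example comments, assuming the retry delay simply doubles from the initial backoff and is capped at the maximum (the actual backoff strategy in postgres_impl.py is not shown in this diff and may differ, e.g. by adding jitter):

```python
# Sanity-check the advertised ~225s total retry window for the new defaults.
# Assumption: the delay doubles per attempt and is capped at backoff_max.
retries, backoff, backoff_max = 10, 3.0, 30.0

delays = [min(backoff * 2**i, backoff_max) for i in range(retries)]
print(delays)       # [3.0, 6.0, 12.0, 24.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0]
print(sum(delays))  # 225.0 -> matches the "~225s" noted in env.example
```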
lightrag/kg/postgres_impl.py (12 changes: 6 additions & 6 deletions)
@@ -1797,34 +1797,34 @@ def get_config() -> dict[str, Any]:
),
# Connection retry configuration
"connection_retry_attempts": min(
10,
100, # Increased from 10 to 100 for long-running operations
int(
os.environ.get(
"POSTGRES_CONNECTION_RETRIES",
config.get("postgres", "connection_retries", fallback=3),
config.get("postgres", "connection_retries", fallback=10),
)
),
),
"connection_retry_backoff": min(
5.0,
300.0, # Increased from 5.0 to 300.0 (5 minutes) for PG switchover scenarios
float(
os.environ.get(
"POSTGRES_CONNECTION_RETRY_BACKOFF",
config.get(
"postgres", "connection_retry_backoff", fallback=0.5
"postgres", "connection_retry_backoff", fallback=3.0
),
)
),
),
"connection_retry_backoff_max": min(
60.0,
600.0, # Increased from 60.0 to 600.0 (10 minutes) for PG switchover scenarios
float(
os.environ.get(
"POSTGRES_CONNECTION_RETRY_BACKOFF_MAX",
config.get(
"postgres",
"connection_retry_backoff_max",
fallback=5.0,
fallback=30.0,
),
)
),
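For context, a minimal sketch of the kind of retry loop these three settings drive, assuming exponential backoff capped at backoff_max (hypothetical; the real ClientManager retry logic is not part of this diff):

```python
import asyncio

async def connect_with_retries(connect, attempts=10, backoff=3.0, backoff_max=30.0):
    """Hypothetical retry helper illustrating how the config values interact."""
    for attempt in range(1, attempts + 1):
        try:
            return await connect()
        except OSError as exc:  # e.g. connection refused during a PG switchover
            if attempt == attempts:
                raise  # out of attempts, surface the error
            delay = min(backoff * 2 ** (attempt - 1), backoff_max)
            print(f"attempt {attempt} failed ({exc}); retrying in {delay:.1f}s")
            await asyncio.sleep(delay)
```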
lightrag_webui/src/features/DocumentManager.tsx (20 changes: 12 additions & 8 deletions)
@@ -584,7 +584,7 @@ export default function DocumentManager() {
// Utility function to create timeout wrapper for API calls
const withTimeout = useCallback((
promise: Promise<any>,
timeoutMs: number = 30000,
timeoutMs: number = 30000, // Default 30s timeout for normal operations
errorMsg: string = 'Request timeout'
): Promise<any> => {
const timeoutPromise = new Promise((_, reject) => {
@@ -676,7 +676,8 @@ export default function DocumentManager() {
// Intelligent refresh function: handles all boundary cases
const handleIntelligentRefresh = useCallback(async (
targetPage?: number, // Optional target page, defaults to current page
resetToFirst?: boolean // Whether to force reset to first page
resetToFirst?: boolean, // Whether to force reset to first page
customTimeout?: number // Optional custom timeout in milliseconds (uses withTimeout default if not provided)
) => {
try {
if (!isMountedRef.current) return;
@@ -694,10 +695,10 @@
sort_direction: sortDirection
};

// Use timeout wrapper for the API call
// Use timeout wrapper for the API call (uses customTimeout if provided, otherwise withTimeout default)
const response = await withTimeout(
getDocumentsPaginated(request),
30000, // 30 second timeout
customTimeout, // Pass undefined to use default 30s, or explicit timeout for special cases
'Document fetch timeout'
);

@@ -717,7 +718,7 @@

const lastPageResponse = await withTimeout(
getDocumentsPaginated(lastPageRequest),
30000,
customTimeout, // Use same timeout for consistency
'Document fetch timeout'
);

@@ -847,7 +848,10 @@ export default function DocumentManager() {
// Reset health check timer with 1 second delay to avoid race condition
useBackendState.getState().resetHealthCheckTimerDelayed(1000);

// Start fast refresh with 2-second interval immediately after scan
// Perform immediate refresh with 90s timeout after scan (tolerates PostgreSQL switchover)
await handleIntelligentRefresh(undefined, false, 90000);

// Start fast refresh with 2-second interval after initial refresh
startPollingInterval(2000);

// Set recovery timer to restore normal polling interval after 15 seconds
@@ -865,7 +869,7 @@
toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) }));
}
}
}, [t, startPollingInterval, currentTab, health, statusCounts])
}, [t, startPollingInterval, currentTab, health, statusCounts, handleIntelligentRefresh])

// Handle page size change - update state and save to store
const handlePageSizeChange = useCallback((newPageSize: number) => {
@@ -1184,7 +1188,7 @@ export default function DocumentManager() {
) : !isSelectionMode ? (
<ClearDocumentsDialog onDocumentsCleared={handleDocumentsCleared} />
) : null}
<UploadDocumentsDialog onDocumentsUploaded={fetchDocuments} />
<UploadDocumentsDialog onDocumentsUploaded={() => handleIntelligentRefresh(undefined, false, 120000)} />
<PipelineStatusDialog
open={showPipelineStatus}
onOpenChange={setShowPipelineStatus}
tests/test_postgres_retry_integration.py (8 changes: 4 additions & 4 deletions)
@@ -40,15 +40,15 @@ def db_config(self):
"database": os.getenv("POSTGRES_DATABASE", "postgres"),
"workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"),
"max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")),
# Connection retry configuration
# Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config()
"connection_retry_attempts": min(
10, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
),
"connection_retry_backoff": min(
5.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))

P2: Sync integration test defaults with retry config

The db_config fixture still seeds connection retries with the old defaults (3 attempts with 0.5s backoff) even though ClientManager.get_config now defaults to 10 attempts with a 3s backoff for HA tolerance. Running the integration suite without overriding env vars will continue exercising the legacy, non-HA settings and won’t detect regressions in the new default behavior described in this commit.

),
"connection_retry_backoff_max": min(
60.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
),
"pool_close_timeout": min(
30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))
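Regarding the review note above: one way to keep the test fixture aligned with the new production defaults would be to mirror the fallbacks and caps from ClientManager.get_config(). A hypothetical adjustment, not part of this commit:

```python
import os

# Hypothetical db_config fixture fragment mirroring the new get_config() defaults
retry_config = {
    "connection_retry_attempts": min(100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10"))),
    "connection_retry_backoff": min(300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0"))),
    "connection_retry_backoff_max": min(600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0"))),
    "pool_close_timeout": min(30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))),
}
```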