Skip to content

Commit ddc8a54

Browse files
fix: continue monitoring if unhandled Exception is thrown
chore: add additional logging for efm
1 parent 0eb9a4e commit ddc8a54

File tree

2 files changed

+105
-77
lines changed

2 files changed

+105
-77
lines changed

wrapper/src/main/java/software/amazon/jdbc/plugin/efm/MonitorImpl.java

Lines changed: 101 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ public MonitorImpl(
100100
this.properties = properties;
101101
this.monitorDisposalTimeMillis = monitorDisposalTimeMillis;
102102
this.monitorService = monitorService;
103-
103+
104104
this.contextLastUsedTimestampNano = this.getCurrentTimeNano();
105105
this.contextsSizeGauge = telemetryFactory.createGauge("efm.activeContexts.queue.size",
106106
() -> (long) activeContexts.size());
@@ -113,6 +113,9 @@ public MonitorImpl(
113113

114114
@Override
115115
public void startMonitoring(final MonitorConnectionContext context) {
116+
if (this.stopped) {
117+
LOGGER.warning(() -> Messages.get("MonitorImpl.monitorIsStopped", new Object[] {this.hostSpec.getHost()}));
118+
}
116119
final long currentTimeNano = this.getCurrentTimeNano();
117120
context.setStartMonitorTimeNano(currentTimeNano);
118121
this.contextLastUsedTimestampNano = currentTimeNano;
@@ -143,107 +146,128 @@ public void run() {
143146
try {
144147
this.stopped = false;
145148
while (true) {
149+
try {
146150

147-
// process new contexts
148-
MonitorConnectionContext newMonitorContext;
149-
MonitorConnectionContext firstAddedNewMonitorContext = null;
150-
final long currentTimeNano = this.getCurrentTimeNano();
151-
while ((newMonitorContext = this.newContexts.poll()) != null) {
152-
if (firstAddedNewMonitorContext == newMonitorContext) {
153-
// This context has already been processed.
154-
// Add it back to the queue and process it in the next round.
155-
this.newContexts.add(newMonitorContext);
156-
break;
157-
}
158-
if (newMonitorContext.isActiveContext()) {
159-
if (newMonitorContext.getExpectedActiveMonitoringStartTimeNano() > currentTimeNano) {
160-
// The context active monitoring time hasn't come.
161-
// Add the context to the queue and check it later.
151+
// process new contexts
152+
MonitorConnectionContext newMonitorContext;
153+
MonitorConnectionContext firstAddedNewMonitorContext = null;
154+
final long currentTimeNano = this.getCurrentTimeNano();
155+
while ((newMonitorContext = this.newContexts.poll()) != null) {
156+
if (firstAddedNewMonitorContext == newMonitorContext) {
157+
// This context has already been processed.
158+
// Add it back to the queue and process it in the next round.
162159
this.newContexts.add(newMonitorContext);
163-
if (firstAddedNewMonitorContext == null) {
164-
firstAddedNewMonitorContext = newMonitorContext;
160+
break;
161+
}
162+
if (newMonitorContext.isActiveContext()) {
163+
if (newMonitorContext.getExpectedActiveMonitoringStartTimeNano() > currentTimeNano) {
164+
// The context active monitoring time hasn't come.
165+
// Add the context to the queue and check it later.
166+
this.newContexts.add(newMonitorContext);
167+
if (firstAddedNewMonitorContext == null) {
168+
firstAddedNewMonitorContext = newMonitorContext;
169+
}
170+
} else {
171+
// It's time to start actively monitoring this context.
172+
this.activeContexts.add(newMonitorContext);
165173
}
166-
} else {
167-
// It's time to start actively monitor this context.
168-
this.activeContexts.add(newMonitorContext);
169174
}
170175
}
171-
}
172176

173-
if (!this.activeContexts.isEmpty()) {
177+
if (!this.activeContexts.isEmpty()) {
174178

175-
final long statusCheckStartTimeNano = this.getCurrentTimeNano();
176-
this.contextLastUsedTimestampNano = statusCheckStartTimeNano;
179+
final long statusCheckStartTimeNano = this.getCurrentTimeNano();
180+
this.contextLastUsedTimestampNano = statusCheckStartTimeNano;
177181

178-
final ConnectionStatus status =
179-
checkConnectionStatus(this.nodeCheckTimeoutMillis);
182+
final ConnectionStatus status =
183+
checkConnectionStatus(this.nodeCheckTimeoutMillis);
180184

181-
long delayMillis = -1;
182-
MonitorConnectionContext monitorContext;
183-
MonitorConnectionContext firstAddedMonitorContext = null;
185+
long delayMillis = -1;
186+
MonitorConnectionContext monitorContext;
187+
MonitorConnectionContext firstAddedMonitorContext = null;
184188

185-
while ((monitorContext = this.activeContexts.poll()) != null) {
186-
187-
synchronized (monitorContext) {
188-
// If context is already invalid, just skip it
189-
if (!monitorContext.isActiveContext()) {
190-
continue;
191-
}
189+
while ((monitorContext = this.activeContexts.poll()) != null) {
192190

193-
if (firstAddedMonitorContext == monitorContext) {
194-
// this context has already been processed by this loop
195-
// add it to the queue and exit this loop
196-
this.activeContexts.add(monitorContext);
197-
break;
198-
}
191+
synchronized (monitorContext) {
192+
// If context is already invalid, just skip it
193+
if (!monitorContext.isActiveContext()) {
194+
continue;
195+
}
199196

200-
// otherwise, process this context
201-
monitorContext.updateConnectionStatus(
202-
this.hostSpec.getUrl(),
203-
statusCheckStartTimeNano,
204-
statusCheckStartTimeNano + status.elapsedTimeNano,
205-
status.isValid);
206-
207-
// If context is still valid and node is still healthy, it needs to continue updating this context
208-
if (monitorContext.isActiveContext() && !monitorContext.isNodeUnhealthy()) {
209-
this.activeContexts.add(monitorContext);
210-
if (firstAddedMonitorContext == null) {
211-
firstAddedMonitorContext = monitorContext;
197+
if (firstAddedMonitorContext == monitorContext) {
198+
// this context has already been processed by this loop
199+
// add it to the queue and exit this loop
200+
this.activeContexts.add(monitorContext);
201+
break;
212202
}
213203

214-
if (delayMillis == -1 || delayMillis > monitorContext.getFailureDetectionIntervalMillis()) {
215-
delayMillis = monitorContext.getFailureDetectionIntervalMillis();
204+
// otherwise, process this context
205+
monitorContext.updateConnectionStatus(
206+
this.hostSpec.getUrl(),
207+
statusCheckStartTimeNano,
208+
statusCheckStartTimeNano + status.elapsedTimeNano,
209+
status.isValid);
210+
211+
// If context is still valid and node is still healthy, it needs to continue updating this context
212+
if (monitorContext.isActiveContext() && !monitorContext.isNodeUnhealthy()) {
213+
this.activeContexts.add(monitorContext);
214+
if (firstAddedMonitorContext == null) {
215+
firstAddedMonitorContext = monitorContext;
216+
}
217+
218+
if (delayMillis == -1 || delayMillis > monitorContext.getFailureDetectionIntervalMillis()) {
219+
delayMillis = monitorContext.getFailureDetectionIntervalMillis();
220+
}
216221
}
217222
}
218223
}
219-
}
220224

221-
if (delayMillis == -1) {
222-
// No active contexts
223-
delayMillis = THREAD_SLEEP_WHEN_INACTIVE_MILLIS;
224-
} else {
225-
delayMillis -= status.elapsedTimeNano;
226-
// Check for min delay between node health check
227-
if (delayMillis <= 0) {
228-
delayMillis = MIN_CONNECTION_CHECK_TIMEOUT_MILLIS;
225+
if (delayMillis == -1) {
226+
// No active contexts
227+
delayMillis = THREAD_SLEEP_WHEN_INACTIVE_MILLIS;
228+
} else {
229+
delayMillis -= status.elapsedTimeNano;
230+
// Enforce a minimum delay between node health checks
231+
if (delayMillis <= 0) {
232+
delayMillis = MIN_CONNECTION_CHECK_TIMEOUT_MILLIS;
233+
}
234+
// Use this delay as the node check timeout since it corresponds to the min interval for all active contexts
235+
this.nodeCheckTimeoutMillis = delayMillis;
229236
}
230-
// Use this delay as node checkout timeout since it corresponds to min interval for all active contexts
231-
this.nodeCheckTimeoutMillis = delayMillis;
232-
}
233237

234-
TimeUnit.MILLISECONDS.sleep(delayMillis);
238+
TimeUnit.MILLISECONDS.sleep(delayMillis);
235239

236-
} else {
237-
if ((this.getCurrentTimeNano() - this.contextLastUsedTimestampNano)
238-
>= TimeUnit.MILLISECONDS.toNanos(this.monitorDisposalTimeMillis)) {
239-
monitorService.notifyUnused(this);
240-
break;
240+
} else {
241+
if ((this.getCurrentTimeNano() - this.contextLastUsedTimestampNano)
242+
>= TimeUnit.MILLISECONDS.toNanos(this.monitorDisposalTimeMillis)) {
243+
monitorService.notifyUnused(this);
244+
break;
245+
}
246+
TimeUnit.MILLISECONDS.sleep(THREAD_SLEEP_WHEN_INACTIVE_MILLIS);
241247
}
242-
TimeUnit.MILLISECONDS.sleep(THREAD_SLEEP_WHEN_INACTIVE_MILLIS);
248+
249+
} catch (final InterruptedException intEx) {
250+
throw intEx;
251+
} catch (final Exception ex) {
252+
// log and ignore
253+
LOGGER.warning(
254+
() -> Messages.get(
255+
"MonitorImpl.exceptionDuringMonitoringContinue",
256+
new Object[] {this.hostSpec.getHost(), ex.getMessage()}));
243257
}
244258
}
245259
} catch (final InterruptedException intEx) {
246-
// do nothing; exit thread
260+
// exit thread
261+
LOGGER.warning(
262+
() -> Messages.get(
263+
"MonitorImpl.interruptedExceptionDuringMonitoring",
264+
new Object[] {this.hostSpec.getHost(), intEx.getMessage()}));
265+
} catch (final Exception ex) {
266+
// this should not be reached; log and exit thread
267+
LOGGER.warning(
268+
() -> Messages.get(
269+
"MonitorImpl.exceptionDuringMonitoringStop",
270+
new Object[] {this.hostSpec.getHost(), ex.getMessage()}));
247271
} finally {
248272
if (this.monitoringConn != null) {
249273
try {

wrapper/src/main/resources/aws_advanced_jdbc_wrapper_messages.properties

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,10 @@ MonitorThreadContainer.emptyNodeKeys=Provided node keys are empty.
188188

189189
# Monitor Impl
190190
MonitorImpl.contextNullWarning=Parameter 'context' should not be null.
191+
MonitorImpl.interruptedExceptionDuringMonitoring=Monitoring thread for node {0} was interrupted: {1}
192+
MonitorImpl.exceptionDuringMonitoringContinue=Continuing monitoring after unhandled exception in monitoring thread for node {0}: {1}
193+
MonitorImpl.exceptionDuringMonitoringStop=Stopping monitoring after unhandled exception in monitoring thread for node {0}: {1}
194+
MonitorImpl.monitorIsStopped=Monitoring was already stopped for node {0}.
191195

192196
# Monitor Service Impl
193197
MonitorServiceImpl.nullMonitorParam=Parameter 'monitor' should not be null.

0 commit comments

Comments
 (0)