Skip to content

Commit a78cc52

Browse files
committed
fix(dns): check for negative_ttl of zero
see linkerd/linkerd2#14954.

some user reports describe situations in which, when the linkerd control plane's destination controller is OOM-killed, DNS resolution can momentarily cause the proxy to compute a negative-TTL duration of zero.

this causes a panic in production environments, because `tokio::time::interval` asserts that it has not been provided a duration of zero. this manifests in errors that look like this:

```
thread 'main' panicked at linkerd/app/core/src/control.rs:118:49:
period must be non-zero.
```

this commit patches `linkerd-dns::ResolveError::negative_ttl()` so that it will now log a warning and instead return `None` when a negative TTL of zero is encountered. a shared `duration_from_error()` helper (bikeshedding welcome) helps do this for both A/AAAA and SRV records.

X-Ref: #3807
Signed-off-by: katelyn martin <kate@buoyant.io>
1 parent afb7b79 commit a78cc52

File tree

1 file changed

+33
-21
lines changed

1 file changed

+33
-21
lines changed

linkerd/dns/src/lib.rs

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -201,32 +201,44 @@ impl fmt::Debug for Resolver {
201201
// === impl ResolveError ===
202202

203203
impl ResolveError {
204-
/// Returns the amount of time that the resolver should wait before
205-
/// retrying.
204+
/// Returns the amount of time that the resolver should wait before retrying.
206205
pub fn negative_ttl(&self) -> Option<time::Duration> {
207-
if let Some(hickory_resolver::proto::ProtoErrorKind::NoRecordsFound {
208-
negative_ttl: Some(ttl_secs),
209-
..
210-
}) = self
211-
.a_error
212-
.0
213-
.proto()
214-
.map(hickory_resolver::proto::ProtoError::kind)
215-
{
216-
return Some(time::Duration::from_secs(*ttl_secs as u64));
206+
let Self {
207+
a_error: ARecordError(a_error),
208+
srv_error,
209+
} = self;
210+
211+
match Self::duration_from_error(a_error) {
212+
ttl @ Some(_) => return ttl,
213+
None => {}
217214
}
218215

219-
if let SrvRecordError::Resolve(error) = &self.srv_error {
220-
if let Some(hickory_resolver::proto::ProtoErrorKind::NoRecordsFound {
221-
negative_ttl: Some(ttl_secs),
222-
..
223-
}) = error.proto().map(hickory_resolver::proto::ProtoError::kind)
224-
{
225-
return Some(time::Duration::from_secs(*ttl_secs as u64));
226-
}
216+
match srv_error {
217+
SrvRecordError::Resolve(srv_error) => Self::duration_from_error(srv_error),
218+
SrvRecordError::Invalid(_) => None,
219+
}
220+
}
221+
222+
/// Returns the negative TTL [`Duration`][time::Duration] of a [`hickory_resolver::ResolveError`].
223+
///
224+
/// This function will defensively check for TTLs of 0 and filter them out.
225+
fn duration_from_error(error: &hickory_resolver::ResolveError) -> Option<time::Duration> {
226+
use hickory_resolver::proto::{ProtoError, ProtoErrorKind};
227+
228+
let Some(ProtoErrorKind::NoRecordsFound {
229+
negative_ttl: Some(ttl_secs),
230+
..
231+
}) = error.proto().map(ProtoError::kind)
232+
else {
233+
return None;
234+
};
235+
236+
if *ttl_secs == 0 {
237+
tracing::warn!("received negative TTL of 0s");
238+
return None;
227239
}
228240

229-
None
241+
return Some(time::Duration::from_secs(*ttl_secs as u64));
230242
}
231243
}
232244

0 commit comments

Comments
 (0)