Skip to content

Commit fa6f257

Browse files
nlamirault authored and Ben Clapp committed
Add: could customize Runbook
Signed-off-by: Nicolas Lamirault <nicolas.lamirault@gmail.com>
1 parent 4f57483 commit fa6f257

File tree

6 files changed

+139
-95
lines changed

6 files changed

+139
-95
lines changed

alerts/absent.libsonnet

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
// Alert that fires when the cert-manager job has no `up` series in Prometheus.
// Meant to be mixed into an object that provides `_config.certManagerJobLabel`.
{
  // PromQL expression; the job label is substituted from `_config`.
  local absentExpr = std.format('absent(up{job="%(certManagerJobLabel)s"})', $._config),

  // The single rule of the 'cert-manager' group.
  local certManagerAbsent = {
    alert: 'CertManagerAbsent',
    expr: absentExpr,
    'for': '10m',
    labels: { severity: 'critical' },
    annotations: {
      summary: 'Cert Manager has dissapeared from Prometheus service discovery.',
      description: "New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.",
      // No hard-coded runbook_url: alerts.libsonnet composes this file with
      // add-runbook-links.libsonnet, which fills in runbook_url for alerts
      // that do not define one.
    },
  },

  prometheusAlerts+:: {
    groups+: [
      {
        name: 'cert-manager',
        rules: [certManagerAbsent],
      },
    ],
  },
}

alerts/add-runbook-links.libsonnet

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
// Mixin layer that adds a `runbook_url` annotation to every alerting rule
// that does not already define one. It must be added (`+`) after the objects
// that populate `prometheusAlerts.groups`, because it rewrites `super.groups`.
local utils = import '../lib/utils.libsonnet';

{
  _config+:: {
    // `%s` is replaced with the lower-cased alert name (the RUNBOOK.md anchor).
    // Override this to point alerts at a different runbook.
    runbookURLPattern: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#%s',
  },

  prometheusAlerts+::
    // Returns `rule` unchanged unless it is an alerting rule without a
    // `runbook_url` annotation; in that case the annotation is added.
    local addRunbookURL(rule) =
      // Guard the lookup: an alert rule may have no `annotations` object at
      // all, and `'x' in rule.annotations` would then fail with a
      // missing-field error instead of simply adding the link.
      local needsRunbookURL =
        'alert' in rule &&
        !('annotations' in rule && 'runbook_url' in rule.annotations);
      rule {
        // `+:` merges into existing annotations, or creates the field when
        // the rule had none.
        [if needsRunbookURL then 'annotations']+: {
          // std.asciiLower lower-cases A-Z only, matching the anchor format.
          runbook_url: $._config.runbookURLPattern % std.asciiLower(rule.alert),
        },
      };
    utils.mapRuleGroups(addRunbookURL),
}

alerts/alerts.libsonnet

Lines changed: 3 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,3 @@
1-
{
2-
prometheusAlerts+:: {
3-
groups+: [{
4-
name: 'cert-manager',
5-
rules: [
6-
{
7-
alert: 'CertManagerAbsent',
8-
expr: 'absent(up{job="%(certManagerJobLabel)s"})' % $._config,
9-
'for': '10m',
10-
labels: {
11-
severity: 'critical',
12-
},
13-
annotations: {
14-
summary: 'Cert Manager has dissapeared from Prometheus service discovery.',
15-
description: "New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.",
16-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent',
17-
},
18-
},
19-
{
20-
alert: 'CertManagerCertExpirySoon',
21-
expr: |||
22-
avg by (exported_namespace, namespace, name) (
23-
certmanager_certificate_expiration_timestamp_seconds - time()
24-
) < (%s * 24 * 3600) # 21 days in seconds
25-
||| % $._config.certManagerCertExpiryDays,
26-
'for': '1h',
27-
labels: {
28-
severity: 'warning',
29-
},
30-
annotations: {
31-
summary: 'The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago.',
32-
description: 'The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}.',
33-
dashboard_url: $._config.grafanaExternalUrl + '/d/TvuRo2iMk/cert-manager',
34-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpirySoon',
35-
},
36-
},
37-
{
38-
alert: 'CertManagerCertNotReady',
39-
expr: |||
40-
max by (name, exported_namespace, namespace, condition) (
41-
certmanager_certificate_ready_status{condition!="True"} == 1
42-
)
43-
|||,
44-
'for': '10m',
45-
labels: {
46-
severity: 'critical',
47-
},
48-
annotations: {
49-
summary: 'The cert `{{ $labels.name }}` is not ready to serve traffic.',
50-
description: 'This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead.',
51-
dashboard_url: $._config.grafanaExternalUrl + '/d/TvuRo2iMk/cert-manager',
52-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertNotReady',
53-
},
54-
},
55-
{
56-
alert: 'CertManagerCertExpiryMetricMissing',
57-
expr: 'absent(certmanager_certificate_expiration_timestamp_seconds)',
58-
'for': '10m',
59-
labels: {
60-
severity: 'info',
61-
},
62-
annotations: {
63-
summary: 'The metric used to observe cert-manager cert expiry is missing.',
64-
description: 'We are blind as to whether or not we can alert on certificates expiring. It could also be the case that there have not had any Certificate CRDs created.',
65-
dashboard_url: $._config.grafanaExternalUrl + '/d/TvuRo2iMk/cert-manager',
66-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpiryMetricMissing',
67-
},
68-
},
69-
{
70-
alert: 'CertManagerHittingRateLimits',
71-
expr: |||
72-
sum by (host) (
73-
rate(certmanager_http_acme_client_request_count{status="429"}[5m])
74-
) > 0
75-
|||,
76-
'for': '5m',
77-
labels: {
78-
severity: 'critical',
79-
},
80-
annotations: {
81-
summary: 'Cert manager hitting LetsEncrypt rate limits.',
82-
description: 'Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.',
83-
dashboard_url: $._config.grafanaExternalUrl + '/d/TvuRo2iMk/cert-manager',
84-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerHittingRateLimits',
85-
},
86-
},
87-
],
88-
}],
89-
},
90-
}
1+
(import 'absent.libsonnet') +
2+
(import 'certificates.libsonnet') +
3+
(import 'add-runbook-links.libsonnet')

alerts/certificates.libsonnet

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
// Alerting rules about certificates managed by cert-manager, published as the
// 'certificates' rule group. Meant to be mixed into an object that provides
// `_config.certManagerCertExpiryDays` and `_config.grafanaExternalUrl`.
{
  // Every rule in this group links to the same Grafana dashboard.
  local dashboardUrl = $._config.grafanaExternalUrl + '/d/TvuRo2iMk/cert-manager',

  // Certificate is closer to expiry than the configured number of days.
  local certExpirySoon = {
    alert: 'CertManagerCertExpirySoon',
    // The threshold in days comes from `_config`; the trailing `#` text is
    // part of the PromQL string itself.
    expr: |||
      avg by (exported_namespace, namespace, name) (
      certmanager_certificate_expiration_timestamp_seconds - time()
      ) < (%s * 24 * 3600) # 21 days in seconds
    ||| % $._config.certManagerCertExpiryDays,
    'for': '1h',
    labels: { severity: 'warning' },
    annotations: {
      summary: 'The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago.',
      description: 'The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}.',
      dashboard_url: dashboardUrl,
    },
  },

  // Certificate reports a ready condition other than "True".
  local certNotReady = {
    alert: 'CertManagerCertNotReady',
    expr: |||
      max by (name, exported_namespace, namespace, condition) (
      certmanager_certificate_ready_status{condition!="True"} == 1
      )
    |||,
    'for': '10m',
    labels: { severity: 'critical' },
    annotations: {
      summary: 'The cert `{{ $labels.name }}` is not ready to serve traffic.',
      description: 'This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead.',
      dashboard_url: dashboardUrl,
    },
  },

  // The expiry metric itself is absent, so the expiry alert cannot fire.
  local certExpiryMetricMissing = {
    alert: 'CertManagerCertExpiryMetricMissing',
    expr: 'absent(certmanager_certificate_expiration_timestamp_seconds)',
    'for': '10m',
    labels: { severity: 'info' },
    annotations: {
      summary: 'The metric used to observe cert-manager cert expiry is missing.',
      description: 'We are blind as to whether or not we can alert on certificates expiring. It could also be the case that there have not had any Certificate CRDs created.',
      dashboard_url: dashboardUrl,
    },
  },

  // ACME requests are being answered with HTTP 429.
  local hittingRateLimits = {
    alert: 'CertManagerHittingRateLimits',
    expr: |||
      sum by (host) (
      rate(certmanager_http_acme_client_request_count{status="429"}[5m])
      ) > 0
    |||,
    'for': '5m',
    labels: { severity: 'critical' },
    annotations: {
      summary: 'Cert manager hitting LetsEncrypt rate limits.',
      description: 'Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.',
      dashboard_url: dashboardUrl,
    },
  },

  prometheusAlerts+:: {
    groups+: [
      {
        name: 'certificates',
        rules: [
          certExpirySoon,
          certNotReady,
          certExpiryMetricMissing,
          hittingRateLimits,
        ],
      },
    ],
  },
}

lib/utils.libsonnet

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
{
  // Builds an object to be added (`+`) on top of an object that has a
  // `groups` array whose elements each have a `rules` array. The result
  // keeps every group as-is except that each rule is replaced by `f(rule)`.
  mapRuleGroups(f): {
    // Rewrites one group; inside the `group { ... }` extension, `super`
    // refers to the group being extended.
    local withMappedRules(group) = group {
      rules: std.map(f, super.rules),
    },

    // Here `super` is the object this result is layered onto.
    groups: std.map(withMappedRules, super.groups),
  },
}

tests.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ tests:
2929
summary: The metric used to observe cert-manager cert expiry is missing.
3030
description: 'We are blind as to whether or not we can alert on certificates expiring. It could also be the case that there have not had any Certificate CRDs created.'
3131
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
32-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpiryMetricMissing'
32+
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirymetricmissing'
3333

3434
# Cert expiry
3535
- interval: 1m
@@ -51,7 +51,7 @@ tests:
5151
summary: The cert `expired-ingress-cert` is 20d 22h 59m 0s from expiry, it should have renewed over a week ago.
5252
description: 'The domain that this cert covers will be unavailable after 20d 22h 59m 0s. Clients using endpoints that this cert protects will start to fail in 20d 22h 59m 0s.'
5353
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
54-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpirySoon'
54+
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon'
5555

5656
# Cert not ready
5757
- interval: 1m
@@ -77,7 +77,7 @@ tests:
7777
description: 'This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert,
7878
the ingress controller _may_ be able to serve that instead.'
7979
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
80-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertNotReady'
80+
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready'
8181
- exp_labels:
8282
severity: critical
8383
exported_namespace: test
@@ -89,7 +89,7 @@ tests:
8989
description: 'This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert,
9090
the ingress controller _may_ be able to serve that instead.'
9191
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
92-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertNotReady'
92+
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready'
9393

9494
# cert-manager rate limits
9595
- interval: 1m
@@ -110,5 +110,5 @@ tests:
110110
exp_annotations:
111111
summary: 'Cert manager hitting LetsEncrypt rate limits.'
112112
description: 'Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.'
113-
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerHittingRateLimits'
113+
runbook_url: 'https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits'
114114
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager

0 commit comments

Comments
 (0)