diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 97d72f40ad..bdc8e26b14 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -193,6 +193,8 @@ image_prometheus: .dkr.ecr..amazonaws.com/cortexlabs/prometh image_prometheus_config_reloader: .dkr.ecr..amazonaws.com/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: .dkr.ecr..amazonaws.com/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: .dkr.ecr..amazonaws.com/cortexlabs/prometheus-statsd-exporter:master +image_prometheus_node_exporter: .dkr.ecr..amazonaws.com/cortexlabs/prometheus-node-exporter:master +image_kube_rbac_proxy: .dkr.ecr..amazonaws.com/cortexlabs/kube-rbac-proxy:master image_grafana: .dkr.ecr..amazonaws.com/cortexlabs/grafana:master image_event_exporter: .dkr.ecr..amazonaws.com/cortexlabs/event-exporter:master ``` @@ -222,6 +224,8 @@ image_prometheus: gcr.io//cortexlabs/prometheus:master image_prometheus_config_reloader: gcr.io//cortexlabs/prometheus-config-reloader:master image_prometheus_operator: gcr.io//cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: gcr.io//cortexlabs/prometheus-statsd-exporter:master +image_prometheus_node_exporter: gcr.io//cortexlabs/prometheus-node-exporter:master +image_kube_rbac_proxy: gcr.io//cortexlabs/kube-rbac-proxy:master image_grafana: gcr.io//cortexlabs/grafana:master image_event_exporter: gcr.io//cortexlabs/event-exporter:master ``` diff --git a/build/images.sh b/build/images.sh index 480562708a..95cbd7fc3e 100644 --- a/build/images.sh +++ b/build/images.sh @@ -58,6 +58,8 @@ non_dev_images_cluster=( "prometheus-config-reloader" "prometheus-operator" "prometheus-statsd-exporter" + "prometheus-node-exporter" + "kube-rbac-proxy" "grafana" "event-exporter" ) diff --git a/charts/dashboards/cluster.json b/charts/dashboards/cluster.json new file mode 100644 index 0000000000..0f7b9035bd --- /dev/null +++ b/charts/dashboards/cluster.json @@ -0,0 +1,1092 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 14, + "links": [], + "panels": [ + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 16, + "options": { + "content": "

Cluster

\n", + "mode": "html" + }, + "pluginVersion": "7.4.0", + "timeFrom": null, + "timeShift": null, + "title": "", + "transparent": true, + "type": "text" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 10, + "panels": [], + "repeat": null, + "title": "CPU", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Saturation (load1 per CPU)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 11, + "panels": [], + "repeat": null, + "title": "Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 11 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Saturation (Major Page Faults)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "rps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 12, + "panels": [], + "repeat": null, + "title": "Network", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 19 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ Receive/", + "stack": "A" + }, + { + "alias": "/ Transmit/", + "stack": "B", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Receive", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + }, + { + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Transmit", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Net Utilisation (Bytes Receive/Transmit)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 19 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ Receive/", + "stack": "A" + }, + { + "alias": "/ Transmit/", + "stack": "B", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Receive", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + }, + { + "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Transmit", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Net Saturation (Drops Receive/Transmit)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "rps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 13, + "panels": [], + "repeat": null, + "title": "Disk IO", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{device}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{device}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 14, + "panels": [], + "repeat": null, + "title": "Disk Space", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Cluster", + "uid": "cluster", + "version": 1 +} diff --git a/charts/dashboards/nodes.json b/charts/dashboards/nodes.json new file mode 100644 index 0000000000..146947fd10 --- /dev/null +++ b/charts/dashboards/nodes.json @@ -0,0 +1,992 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 12, + "iteration": 1613734078062, + "links": [], + "panels": [ + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 11, + "options": { + "content": "

Nodes

\n", + "mode": "html" + }, + "pluginVersion": "7.4.0", + "timeFrom": null, + "timeShift": null, + "title": "", + "transparent": true, + "type": "text" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", + "format": "time_series", + "interval": "1m", + "intervalFactor": 5, + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 2 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "1m load average", + "refId": "A" + }, + { + "expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5m load average", + "refId": "B" + }, + { + "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "15m load average", + "refId": "C" + }, + { + "expr": "count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "logical cores", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load Average", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 18, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "expr": "node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/ read| written/", + "yaxis": 1 + }, + { + "alias": "/ io time/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}} read", + "refId": "A" + }, + { + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}} written", + "refId": "B" + }, + { + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}} io time", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "used", + "color": "#E0B400" + }, + { + "alias": "available", + "color": "#73BF69" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "used", + "refId": "A" + }, + { + "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Received", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Transmitted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(node_exporter_build_info{job=\"node-exporter\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Nodes", + "uid": "nodes", + "version": 3 +} diff --git a/charts/templates/grafana.yaml b/charts/templates/grafana.yaml index eb65e707cf..67b8100ca4 100644 --- a/charts/templates/grafana.yaml +++ b/charts/templates/grafana.yaml @@ -85,6 +85,28 @@ data: --- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-cluster + namespace: {{ .Release.Namespace }} +data: + cluster.json: |- +{{ .Files.Get "dashboards/cluster.json" | indent 4 }} + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-nodes + namespace: {{ .Release.Namespace }} +data: + nodes.json: |- +{{ .Files.Get "dashboards/nodes.json" | indent 4 }} + +--- + apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -158,6 +180,12 @@ spec: - mountPath: /grafana-dashboard-definitions/cortex/batch name: grafana-dashboard-batch readOnly: false + - mountPath: /grafana-dashboard-definitions/cortex/cluster + name: grafana-dashboard-cluster + readOnly: false + - mountPath: /grafana-dashboard-definitions/cortex/nodes + name: grafana-dashboard-nodes + readOnly: false securityContext: fsGroup: 65534 runAsNonRoot: true @@ -178,6 +206,12 @@ spec: - name: grafana-dashboard-batch configMap: name: grafana-dashboard-batch + - name: grafana-dashboard-cluster + configMap: + name: grafana-dashboard-cluster + - name: grafana-dashboard-nodes + configMap: + name: grafana-dashboard-nodes affinity: podAffinity: preferredDuringSchedulingIgnoredDuringExecution: diff --git a/charts/templates/prometheus-monitoring.yaml b/charts/templates/prometheus-monitoring.yaml index 88fbd26775..18a0a9e11b 100644 --- a/charts/templates/prometheus-monitoring.yaml +++ b/charts/templates/prometheus-monitoring.yaml @@ -28,6 +28,14 @@ spec: - key: "monitoring.cortex.dev" operator: "In" values: [ "istio", "request-monitor", "statsd-exporter" ] + serviceMonitorSelector: + matchExpressions: + - key: "monitoring.cortex.dev" + operator: "In" + values: [ "node-exporter" ] + ruleSelector: + matchLabels: + prometheus: k8s resources: requests: memory: 400Mi diff --git a/charts/templates/prometheus-node-exporter.yaml b/charts/templates/prometheus-node-exporter.yaml new file mode 100644 index 0000000000..13bb7d9d76 --- /dev/null +++ b/charts/templates/prometheus-node-exporter.yaml @@ -0,0 +1,257 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter + namespace: {{ .Release.Namespace }} + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-exporter +rules: + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: + - kind: ServiceAccount + name: node-exporter + namespace: {{ .Release.Namespace }} + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + name: node-exporter + namespace: {{ .Release.Namespace }} +spec: + clusterIP: None + ports: + - name: https + port: 9100 + targetPort: https + selector: + app.kubernetes.io/name: node-exporter + +--- + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + name: node-exporter + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + app.kubernetes.io/name: node-exporter + template: + metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + spec: + containers: + - args: + - --web.listen-address=127.0.0.1:9100 + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --no-collector.wifi + - --no-collector.hwmon + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + image: {{ .Values.cortex.image_prometheus_node_exporter | quote }} + name: node-exporter + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 100m + memory: 180Mi + volumeMounts: + - mountPath: /host/proc + name: proc + readOnly: false + - mountPath: /host/sys + name: sys + readOnly: false + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + - args: + - --logtostderr + - --secure-listen-address=[$(IP)]:9100 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:9100/ + env: + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: {{ .Values.cortex.image_kube_rbac_proxy }} + name: kube-rbac-proxy + ports: + - containerPort: 9100 + hostPort: 9100 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + hostNetwork: true + hostPID: true + nodeSelector: + kubernetes.io/os: linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: node-exporter + tolerations: + - operator: Exists + volumes: + - hostPath: + path: /proc + name: proc + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root + updateStrategy: + rollingUpdate: + maxUnavailable: 10% + +--- + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + monitoring.cortex.dev: node-exporter + name: node-exporter + namespace: {{ .Release.Namespace }} +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/name: node-exporter + +--- + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.1.0 + prometheus: k8s + name: node-exporter-rules + namespace: {{ .Release.Namespace }} +spec: + groups: + - name: node-exporter.rules + rules: + - expr: | + count without (cpu) ( + count without (mode) ( + node_cpu_seconds_total{job="node-exporter"} + ) + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu, mode) ( + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) + ) + record: instance:node_cpu_utilisation:rate1m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + node_memory_MemAvailable_bytes{job="node-exporter"} + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) + record: instance:node_vmstat_pgmajfault:rate1m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) + record: instance_device:node_disk_io_time_seconds:rate1m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate1m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate1m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate1m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate1m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate1m diff --git a/charts/values.yaml b/charts/values.yaml index 6a88bd9aba..2548d81d03 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -28,6 +28,8 @@ cortex: image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master + image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master + image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master image_grafana: quay.io/cortexlabs/grafana:master image_event_exporter: quay.io/cortexlabs/event-exporter:master diff --git a/cli/cmd/lib_cluster_config_aws.go b/cli/cmd/lib_cluster_config_aws.go index 3975dab6fa..4617ea2369 100644 --- a/cli/cmd/lib_cluster_config_aws.go +++ b/cli/cmd/lib_cluster_config_aws.go @@ -424,6 +424,14 @@ func setConfigFieldsFromCached(userClusterConfig *clusterconfig.Config, cachedCl return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusStatsDExporterKey, cachedClusterConfig.ImagePrometheusStatsDExporter) } + if s.Obj(cachedClusterConfig.ImagePrometheusNodeExporter) != s.Obj(userClusterConfig.ImagePrometheusNodeExporter) { + return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusNodeExporterKey, cachedClusterConfig.ImagePrometheusNodeExporter) + } + + if s.Obj(cachedClusterConfig.ImageKubeRBACProxy) != s.Obj(userClusterConfig.ImageKubeRBACProxy) { + return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImageKubeRBACProxyKey, cachedClusterConfig.ImageKubeRBACProxy) + } + if s.Obj(cachedClusterConfig.ImageGrafana) != s.Obj(userClusterConfig.ImageGrafana) { return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImageGrafanaKey, cachedClusterConfig.ImageGrafana) } diff --git a/dev/versions.md b/dev/versions.md index e74b823587..902af66338 100644 --- a/dev/versions.md +++ b/dev/versions.md @@ -323,6 +323,14 @@ supported () 1. Update the base image version in `images/prometheus-statsd-exporter/Dockerfile`. 1. Update `prometheus-statsd-exporter.yaml` as necessary, if that's the case. +## Prometheus Node Exporter + +1. Find the latest release in the Kube Prometheus [GitHub Repo](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/). +1. Copy the `node-exporter-*.yaml` files contents into `prometheus-node-exporter.yaml`, but keep the prometheus rules resource. +1. Replace the image in the Deployment resource with a cortex env var. +1. Update the base image versions in `images/prometheus-node-exporter/Dockerfile` + and `images/kube-rbac-proxy/Dockerfile`. + ## Grafana 1. Find the latest release diff --git a/docs/clusters/aws/install.md b/docs/clusters/aws/install.md index b3a76cca4f..1532f598aa 100644 --- a/docs/clusters/aws/install.md +++ b/docs/clusters/aws/install.md @@ -108,6 +108,8 @@ image_prometheus: quay.io/cortexlabs/prometheus:master image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master +image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master +image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master image_grafana: quay.io/cortexlabs/grafana:master image_event_exporter: quay.io/cortexlabs/event-exporter:master ``` diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md index 4927d64971..be3e043b22 100644 --- a/docs/clusters/gcp/install.md +++ b/docs/clusters/gcp/install.md @@ -82,6 +82,8 @@ image_prometheus: quay.io/cortexlabs/prometheus:master image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master +image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master +image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master image_grafana: quay.io/cortexlabs/grafana:master image_event_exporter: quay.io/cortexlabs/event-exporter:master ``` diff --git a/images/grafana/Dockerfile b/images/grafana/Dockerfile index 50f9cb3064..dd60306c63 100644 --- a/images/grafana/Dockerfile +++ b/images/grafana/Dockerfile @@ -1 +1 @@ -FROM grafana/grafana:7.4.0 +FROM grafana/grafana:7.4.2 diff --git a/images/kube-rbac-proxy/Dockerfile b/images/kube-rbac-proxy/Dockerfile new file mode 100644 index 0000000000..1ffeaecb1d --- /dev/null +++ b/images/kube-rbac-proxy/Dockerfile @@ -0,0 +1 @@ +FROM quay.io/coreos/kube-rbac-proxy:v0.4.1 diff --git a/images/prometheus-node-exporter/Dockerfile b/images/prometheus-node-exporter/Dockerfile new file mode 100644 index 0000000000..98a514b52f --- /dev/null +++ b/images/prometheus-node-exporter/Dockerfile @@ -0,0 +1 @@ +FROM quay.io/prometheus/node-exporter:v0.18.1 diff --git a/manager/install.sh b/manager/install.sh index c6f744fc27..5a9917f16c 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -303,12 +303,15 @@ function setup_secrets() { function setup_prometheus() { envsubst < manifests/prometheus-operator.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-statsd-exporter.yaml | kubectl apply -f - >/dev/null + envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null } function setup_grafana() { kubectl apply -f manifests/grafana/grafana-dashboard-realtime.yaml >/dev/null kubectl apply -f manifests/grafana/grafana-dashboard-batch.yaml >/dev/null + kubectl apply -f manifests/grafana/grafana-dashboard-cluster.yaml >/dev/null + kubectl apply -f manifests/grafana/grafana-dashboard-nodes.yaml >/dev/null envsubst < manifests/grafana/grafana.yaml | kubectl apply -f - >/dev/null } diff --git a/manager/manifests/grafana/grafana-dashboard-cluster.yaml b/manager/manifests/grafana/grafana-dashboard-cluster.yaml new file mode 100644 index 0000000000..c17d0f7af2 --- /dev/null +++ b/manager/manifests/grafana/grafana-dashboard-cluster.yaml @@ -0,0 +1,1113 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-cluster + namespace: default +data: + cluster.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 14, + "links": [], + "panels": [ + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 16, + "options": { + "content": "

Cluster

\n", + "mode": "html" + }, + "pluginVersion": "7.4.0", + "timeFrom": null, + "timeShift": null, + "title": "", + "transparent": true, + "type": "text" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 10, + "panels": [], + "repeat": null, + "title": "CPU", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Saturation (load1 per CPU)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 11, + "panels": [], + "repeat": null, + "title": "Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 11 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Saturation (Major Page Faults)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "rps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 12, + "panels": [], + "repeat": null, + "title": "Network", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 19 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ Receive/", + "stack": "A" + }, + { + "alias": "/ Transmit/", + "stack": "B", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Receive", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + }, + { + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Transmit", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Net Utilisation (Bytes Receive/Transmit)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 19 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ Receive/", + "stack": "A" + }, + { + "alias": "/ Transmit/", + "stack": "B", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Receive", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + }, + { + "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Transmit", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Net Saturation (Drops Receive/Transmit)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "rps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 13, + "panels": [], + "repeat": null, + "title": "Disk IO", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{device}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{device}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 14, + "panels": [], + "repeat": null, + "title": "Disk Space", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": "/dashboard/file/node-rsrc-use.json", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Cluster", + "uid": "cluster", + "version": 1 + } diff --git a/manager/manifests/grafana/grafana-dashboard-nodes.yaml b/manager/manifests/grafana/grafana-dashboard-nodes.yaml new file mode 100644 index 0000000000..a5bc6c16dc --- /dev/null +++ b/manager/manifests/grafana/grafana-dashboard-nodes.yaml @@ -0,0 +1,1013 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-nodes + namespace: default +data: + nodes.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 12, + "iteration": 1613734078062, + "links": [], + "panels": [ + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 11, + "options": { + "content": "

Nodes

\n", + "mode": "html" + }, + "pluginVersion": "7.4.0", + "timeFrom": null, + "timeShift": null, + "title": "", + "transparent": true, + "type": "text" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", + "format": "time_series", + "interval": "1m", + "intervalFactor": 5, + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 2 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "1m load average", + "refId": "A" + }, + { + "expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5m load average", + "refId": "B" + }, + { + "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "15m load average", + "refId": "C" + }, + { + "expr": "count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "logical cores", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load Average", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 18, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "expr": "node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/ read| written/", + "yaxis": 1 + }, + { + "alias": "/ io time/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}} read", + "refId": "A" + }, + { + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}} written", + "refId": "B" + }, + { + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}} io time", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "used", + "color": "#E0B400" + }, + { + "alias": "available", + "color": "#73BF69" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "used", + "refId": "A" + }, + { + "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Received", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.0", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Transmitted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(node_exporter_build_info{job=\"node-exporter\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Nodes", + "uid": "nodes", + "version": 3 + } diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml index b7f7c41056..6c5b995371 100644 --- a/manager/manifests/grafana/grafana.yaml +++ b/manager/manifests/grafana/grafana.yaml @@ -135,6 +135,12 @@ spec: - mountPath: /grafana-dashboard-definitions/cortex/batch name: grafana-dashboard-batch readOnly: false + - mountPath: /grafana-dashboard-definitions/cortex/cluster + name: grafana-dashboard-cluster + readOnly: false + - mountPath: /grafana-dashboard-definitions/cortex/nodes + name: grafana-dashboard-nodes + readOnly: false securityContext: fsGroup: 65534 runAsNonRoot: true @@ -155,6 +161,12 @@ spec: - name: grafana-dashboard-batch configMap: name: grafana-dashboard-batch + - name: grafana-dashboard-cluster + configMap: + name: grafana-dashboard-cluster + - name: grafana-dashboard-nodes + configMap: + name: grafana-dashboard-nodes affinity: podAffinity: preferredDuringSchedulingIgnoredDuringExecution: diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index ef578ca26c..eb5b2d4c21 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -41,6 +41,14 @@ spec: - key: "monitoring.cortex.dev" operator: "In" values: [ "istio", "request-monitor", "statsd-exporter" ] + serviceMonitorSelector: + matchExpressions: + - key: "monitoring.cortex.dev" + operator: "In" + values: [ "node-exporter" ] + ruleSelector: + matchLabels: + prometheus: k8s resources: requests: memory: 400Mi diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml new file mode 100644 index 0000000000..c1a6b0b4a0 --- /dev/null +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -0,0 +1,271 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter + namespace: default + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-exporter +rules: + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: + - kind: ServiceAccount + name: node-exporter + namespace: default + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + name: node-exporter + namespace: default +spec: + clusterIP: None + ports: + - name: https + port: 9100 + targetPort: https + selector: + app.kubernetes.io/name: node-exporter + +--- + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + name: node-exporter + namespace: default +spec: + selector: + matchLabels: + app.kubernetes.io/name: node-exporter + template: + metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + spec: + containers: + - args: + - --web.listen-address=127.0.0.1:9100 + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --no-collector.wifi + - --no-collector.hwmon + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + image: $CORTEX_IMAGE_PROMETHEUS_NODE_EXPORTER + name: node-exporter + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 100m + memory: 180Mi + volumeMounts: + - mountPath: /host/proc + name: proc + readOnly: false + - mountPath: /host/sys + name: sys + readOnly: false + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + - args: + - --logtostderr + - --secure-listen-address=[$(IP)]:9100 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:9100/ + env: + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: $CORTEX_IMAGE_KUBE_RBAC_PROXY + name: kube-rbac-proxy + ports: + - containerPort: 9100 + hostPort: 9100 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + hostNetwork: true + hostPID: true + nodeSelector: + kubernetes.io/os: linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: node-exporter + tolerations: + - operator: Exists + volumes: + - hostPath: + path: /proc + name: proc + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root + updateStrategy: + rollingUpdate: + maxUnavailable: 10% + +--- + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: v0.18.1 + monitoring.cortex.dev: node-exporter + name: node-exporter + namespace: default +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/name: node-exporter + +--- + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.1.0 + prometheus: k8s + name: node-exporter-rules + namespace: default +spec: + groups: + - name: node-exporter.rules + rules: + - expr: | + count without (cpu) ( + count without (mode) ( + node_cpu_seconds_total{job="node-exporter"} + ) + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu, mode) ( + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) + ) + record: instance:node_cpu_utilisation:rate1m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + node_memory_MemAvailable_bytes{job="node-exporter"} + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) + record: instance:node_vmstat_pgmajfault:rate1m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) + record: instance_device:node_disk_io_time_seconds:rate1m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate1m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate1m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate1m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate1m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate1m diff --git a/pkg/operator/resources/validations.go b/pkg/operator/resources/validations.go index d9d075b88c..2bd6d9abc9 100644 --- a/pkg/operator/resources/validations.go +++ b/pkg/operator/resources/validations.go @@ -147,21 +147,23 @@ func ValidateClusterAPIs(apis []userconfig.API, projectFiles spec.ProjectFiles) CPU Reservations: FluentBit 100 -StatsD 100 +StatsDExporter 100 +NodeExporter 100 KubeProxy 100 AWS cni 10 Reserved (150 + 150) see eks.yaml for details */ -var _cortexCPUReserve = kresource.MustParse("610m") +var _cortexCPUReserve = kresource.MustParse("710m") /* Memory Reservations: FluentBit 150 -StatsD 100 +StatsDExporter 100 +NodeExporter 180 Reserved (300 + 300 + 200) see eks.yaml for details */ -var _cortexMemReserve = kresource.MustParse("1050Mi") +var _cortexMemReserve = kresource.MustParse("1230Mi") var _nvidiaCPUReserve = kresource.MustParse("100m") var _nvidiaMemReserve = kresource.MustParse("100Mi") diff --git a/pkg/types/clusterconfig/cluster_config_aws.go b/pkg/types/clusterconfig/cluster_config_aws.go index fa87a122b6..ad4013ed11 100644 --- a/pkg/types/clusterconfig/cluster_config_aws.go +++ b/pkg/types/clusterconfig/cluster_config_aws.go @@ -80,6 +80,8 @@ type CoreConfig struct { ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"` ImagePrometheusOperator string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"` ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"` + ImagePrometheusNodeExporter string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"` + ImageKubeRBACProxy string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"` ImageGrafana string `json:"image_grafana" yaml:"image_grafana"` ImageEventExporter string `json:"image_event_exporter" yaml:"image_event_exporter"` } @@ -332,6 +334,20 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{ Validator: validateImageVersion, }, }, + { + StructField: "ImagePrometheusNodeExporter", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/prometheus-node-exporter:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, + { + StructField: "ImageKubeRBACProxy", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/kube-rbac-proxy:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, { StructField: "ImageGrafana", StringValidation: &cr.StringValidation{ @@ -1291,6 +1307,8 @@ func (cc *CoreConfig) UserTable() table.KeyValuePairs { items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader) items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator) items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter) + items.Add(ImagePrometheusNodeExporterUserKey, cc.ImagePrometheusNodeExporter) + items.Add(ImageKubeRBACProxyUserKey, cc.ImageKubeRBACProxy) items.Add(ImageGrafanaUserKey, cc.ImageGrafana) items.Add(ImageEventExporterUserKey, cc.ImageEventExporter) @@ -1424,6 +1442,12 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} { if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") { event["image_prometheus_statsd_exporter._is_custom"] = true } + if strings.HasPrefix(cc.ImagePrometheusNodeExporter, "cortexlabs/") { + event["image_prometheus_node_exporter._is_custom"] = true + } + if strings.HasPrefix(cc.ImageKubeRBACProxy, "cortexlabs/") { + event["image_kube_rbac_proxy._is_custom"] = true + } if strings.HasPrefix(cc.ImageGrafana, "cortexlabs/") { event["image_grafana._is_custom"] = true } diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index 05ed5dae0f..d0ef58c36c 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -57,6 +57,8 @@ type GCPCoreConfig struct { ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"` ImagePrometheusOperator string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"` ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"` + ImagePrometheusNodeExporter string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"` + ImageKubeRBACProxy string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"` ImageGrafana string `json:"image_grafana" yaml:"image_grafana"` ImageEventExporter string `json:"image_event_exporter" yaml:"image_event_exporter"` } @@ -232,6 +234,20 @@ var GCPCoreConfigStructFieldValidations = []*cr.StructFieldValidation{ Validator: validateImageVersion, }, }, + { + StructField: "ImagePrometheusNodeExporter", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/prometheus-node-exporter:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, + { + StructField: "ImageKubeRBACProxy", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/kube-rbac-proxy:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, { StructField: "ImagePrometheusStatsDExporter", StringValidation: &cr.StringValidation{ @@ -681,6 +697,8 @@ func (cc *GCPCoreConfig) UserTable() table.KeyValuePairs { items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader) items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator) items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter) + items.Add(ImagePrometheusNodeExporterUserKey, cc.ImagePrometheusNodeExporter) + items.Add(ImageKubeRBACProxyUserKey, cc.ImageKubeRBACProxy) items.Add(ImageGrafanaUserKey, cc.ImageGrafana) items.Add(ImageEventExporterUserKey, cc.ImageEventExporter) @@ -714,7 +732,7 @@ func (cc *GCPManagedConfig) UserTable() table.KeyValuePairs { } func (cc *GCPConfig) UserTable() table.KeyValuePairs { - var items *table.KeyValuePairs = &table.KeyValuePairs{} + items := &table.KeyValuePairs{} items.AddAll(cc.GCPCoreConfig.UserTable()) if cc.GCPCoreConfig.IsManaged { items.AddAll(cc.GCPManagedConfig.UserTable()) @@ -787,6 +805,12 @@ func (cc *GCPCoreConfig) TelemetryEvent() map[string]interface{} { if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") { event["image_prometheus_statsd_exporter._is_custom"] = true } + if strings.HasPrefix(cc.ImagePrometheusNodeExporter, "cortexlabs/") { + event["image_prometheus_node_exporter._is_custom"] = true + } + if strings.HasPrefix(cc.ImageKubeRBACProxy, "cortexlabs/") { + event["image_kube_rbac_proxy._is_custom"] = true + } if strings.HasPrefix(cc.ImageGrafana, "cortexlabs/") { event["image_grafana._is_custom"] = true } diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index cb98821581..6ca580b9b2 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -74,6 +74,8 @@ const ( ImagePrometheusConfigReloaderKey = "image_prometheus_config_reloader" ImagePrometheusOperatorKey = "image_prometheus_operator" ImagePrometheusStatsDExporterKey = "image_prometheus_statsd_exporter" + ImagePrometheusNodeExporterKey = "image_prometheus_node_exporter" + ImageKubeRBACProxyKey = "image_kube_rbac_proxy" ImageGrafanaKey = "image_grafana" ImageEventExporterKey = "image_event_exporter" @@ -134,6 +136,8 @@ const ( ImagePrometheusConfigReloaderUserKey = "prometheus config reloader image" ImagePrometheusOperatorUserKey = "prometheus operator image" ImagePrometheusStatsDExporterUserKey = "prometheus statsd exporter image" + ImagePrometheusNodeExporterUserKey = "prometheus node exporter image" + ImageKubeRBACProxyUserKey = "kube rbac proxy image" ImageGrafanaUserKey = "grafana image" ImageEventExporterUserKey = "event exporter image" )