diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b5613d4fb6..bc070614df 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -166,6 +166,9 @@ Add this to your bash profile (e.g. `~/.bash_profile`, `~/.profile` or `~/.bashr # set the default image registry export CORTEX_DEV_DEFAULT_IMAGE_REGISTRY=".dkr.ecr..amazonaws.com/cortexlabs" +# enable api server monitoring in grafana +export CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD="true" + # redirect analytics and error reporting to our dev environment export CORTEX_TELEMETRY_SENTRY_DSN="https://c334df915c014ffa93f2076769e5b334@sentry.io/1848098" export CORTEX_TELEMETRY_SEGMENT_WRITE_KEY="0WvoJyCey9z1W2EW7rYTPJUMRYat46dl" diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 7d2a24b014..2b077849e8 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -348,6 +348,21 @@ var _clusterConfigureCmd = &cobra.Command{ exit.Error(err) } + restConfig, err := getClusterRESTConfig(awsClient, accessConfig.ClusterName) + if err != nil { + exit.Error(err) + } + + scheme := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(scheme); err != nil { + exit.Error(err) + } + + k8sClient, err := k8s.New("default", false, restConfig, scheme) + if err != nil { + exit.Error(err) + } + stacks, err := clusterstate.GetClusterStacks(awsClient, accessConfig) if err != nil { exit.Error(err) @@ -362,7 +377,7 @@ var _clusterConfigureCmd = &cobra.Command{ promptIfNotAdmin(awsClient, _flagClusterDisallowPrompt) - newClusterConfig, configureChanges, err := getConfigureClusterConfig(awsClient, stacks, oldClusterConfig, clusterConfigFile) + newClusterConfig, configureChanges, err := getConfigureClusterConfig(awsClient, k8sClient, stacks, oldClusterConfig, clusterConfigFile) if err != nil { exit.Error(err) } diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go index b6c652fc8b..3ceeca2615 100644 --- a/cli/cmd/lib_cluster_config.go +++ b/cli/cmd/lib_cluster_config.go @@ -27,6 +27,7 @@ import ( cr 
"github.com/cortexlabs/cortex/pkg/lib/configreader" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/files" + "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/maps" libmath "github.com/cortexlabs/cortex/pkg/lib/math" "github.com/cortexlabs/cortex/pkg/lib/pointer" @@ -140,7 +141,7 @@ func getInstallClusterConfig(awsClient *aws.Client, clusterConfigFile string) (* return clusterConfig, nil } -func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.ClusterStacks, cachedClusterConfig clusterconfig.Config, newClusterConfigFile string) (*clusterconfig.Config, clusterconfig.ConfigureChanges, error) { +func getConfigureClusterConfig(awsClient *aws.Client, k8sClient *k8s.Client, stacks clusterstate.ClusterStacks, cachedClusterConfig clusterconfig.Config, newClusterConfigFile string) (*clusterconfig.Config, clusterconfig.ConfigureChanges, error) { newUserClusterConfig := &clusterconfig.Config{} err := readUserClusterConfigFile(newUserClusterConfig, newClusterConfigFile) @@ -151,7 +152,7 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste newUserClusterConfig.Telemetry = isTelemetryEnabled() cachedClusterConfig.Telemetry = newUserClusterConfig.Telemetry - configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, cachedClusterConfig, stacks.NodeGroupsStacks) + configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, k8sClient, cachedClusterConfig, stacks.NodeGroupsStacks) if err != nil { err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor)) return nil, clusterconfig.ConfigureChanges{}, errors.Wrap(err, newClusterConfigFile) diff --git a/cli/cmd/lib_manager.go b/cli/cmd/lib_manager.go index edacce2111..ea2151d248 100644 --- a/cli/cmd/lib_manager.go +++ b/cli/cmd/lib_manager.go @@ -191,6 +191,7 @@ func 
runManagerWithClusterConfig(entrypoint string, clusterConfig *clusterconfig "CORTEX_TELEMETRY_SENTRY_DSN=" + os.Getenv("CORTEX_TELEMETRY_SENTRY_DSN"), "CORTEX_TELEMETRY_SEGMENT_WRITE_KEY=" + os.Getenv("CORTEX_TELEMETRY_SEGMENT_WRITE_KEY"), "CORTEX_DEV_DEFAULT_IMAGE_REGISTRY=" + os.Getenv("CORTEX_DEV_DEFAULT_IMAGE_REGISTRY"), + "CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD=" + os.Getenv("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD"), "CORTEX_CLUSTER_CONFIG_FILE=" + containerClusterConfigPath, } envs = append(envs, extraEnvs...) diff --git a/dev/versions.md b/dev/versions.md index ce204f9b6e..853702188a 100644 --- a/dev/versions.md +++ b/dev/versions.md @@ -177,11 +177,14 @@ see https://github.com/moby/moby/issues/39302#issuecomment-639687466_ 1. Find the latest patch release for our current version of k8s (e.g. k8s v1.17 -> cluster-autocluster v1.17.3) on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog -1. Update the base image in `images/cluster-autoscaler/Dockerfile` to the repository URL shown in the GitHub release 1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws), set the tree to the tag for the chosen release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml` (e.g. ) -1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2` +1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`. +1. Pull the release branch from the upstream repo to Cortex's fork on [GitHub](https://github.com/cortexlabs/autoscaler). +1. Apply the rate-limiter changes from the previous version to the new one (currently on the `cluster-autoscaler-release-1.20` branch). +1. Update the value of the `-b` flag of the `git clone` command in `images/cluster-autoscaler/Dockerfile` to the branch name of the latest release from Cortex's fork. +1. 
Match the Go version of the builder in `images/cluster-autoscaler/Dockerfile` with that of the [cluster autoscaler](https://github.com/kubernetes/autoscaler)'s Dockerfile. ## FluentBit diff --git a/images/cluster-autoscaler/Dockerfile b/images/cluster-autoscaler/Dockerfile index 4d7b17cb71..5b199625b2 100644 --- a/images/cluster-autoscaler/Dockerfile +++ b/images/cluster-autoscaler/Dockerfile @@ -1 +1,11 @@ -FROM k8s.gcr.io/autoscaling/cluster-autoscaler:v1.20.0 +ARG TARGETARCH, TARGETOS + +FROM golang:1.15 AS builder +RUN git clone -b cluster-autoscaler-release-1.20 --depth 1 https://github.com/cortexlabs/autoscaler /k8s.io/autoscaler +WORKDIR /k8s.io/autoscaler/cluster-autoscaler +RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build --installsuffix cgo -o cluster-autoscaler k8s.io/autoscaler/cluster-autoscaler \ + && cp cluster-autoscaler /usr/local/bin + +FROM alpine:3.8 +RUN apk add -U --no-cache ca-certificates && rm -rf /var/cache/apk/* +COPY --from=builder /usr/local/bin/cluster-autoscaler . 
diff --git a/manager/install.sh b/manager/install.sh index c7d6cd99bd..a4b4914401 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -225,7 +225,9 @@ function setup_prometheus() { envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-kube-state-metrics.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null - python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null + envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null + python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml + kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml } function setup_grafana() { @@ -235,7 +237,10 @@ function setup_grafana() { kubectl apply -f manifests/grafana/grafana-dashboard-task.yaml >/dev/null kubectl apply -f manifests/grafana/grafana-dashboard-cluster.yaml >/dev/null kubectl apply -f manifests/grafana/grafana-dashboard-nodes.yaml >/dev/null - envsubst < manifests/grafana/grafana.yaml | kubectl apply -f - >/dev/null + if [ "$CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD" = "true" ]; then + kubectl apply -f manifests/grafana/grafana-dashboard-control-plane.yaml >/dev/null + fi + python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/grafana/grafana.yaml.j2 | kubectl apply -f - >/dev/null } function restart_operator() { diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index 0fd3bb918e..d638e140eb 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -197,6 +197,9 @@ spec: - --ok-total-unready-count=30 - --max-node-provision-time=8m - --scan-interval=20s + - 
--scale-up-rate-limit-enabled=true + - --scale-up-max-number-nodes-per-min=50 + - --scale-up-burst-number-nodes-per-min=75 - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ config['cluster_name'] }} volumeMounts: - name: ssl-certs diff --git a/manager/manifests/grafana/grafana-dashboard-control-plane.yaml b/manager/manifests/grafana/grafana-dashboard-control-plane.yaml new file mode 100644 index 0000000000..3a0f26f196 --- /dev/null +++ b/manager/manifests/grafana/grafana-dashboard-control-plane.yaml @@ -0,0 +1,1234 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-control-plane + namespace: default +data: + control-plane.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Metrics for API Server, Controller and Scheduler.", + "editable": true, + "gnetId": 10907, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 18, + "panels": [], + "title": "API Server Summary", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": null, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 6, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(apiserver_request_total[5m]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{verb}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "API Server RPS", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 5, + "x": 3, + "y": 1 + }, + 
"hiddenSeries": false, + "id": 47, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(apiserver_current_inflight_requests) by (requestKind)", + "legendFormat": "{{requestKind}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Current Inflight Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 5, + "x": 8, + "y": 1 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sort": "current", + "sortDesc": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, 
+ "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le) )", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Latency - p99", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:45", + "decimals": 2, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:46", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": null, + "decimals": 2, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 5, + "x": 13, + "y": 1 + }, + "hiddenSeries": false, + "id": 21, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"exemplar": true, + "expr": "sum(rate(apiserver_request_duration_seconds_sum{verb!~\"WATCH|CONNECT\"}[5m])) ", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "seconds", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "API Server Response Duration (Seconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:101", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:102", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": null, + "decimals": 0, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 5, + "x": 18, + "y": 1 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(apiserver_request_duration_seconds_bucket{code=~\"^(?:4..)$|^(?:5..)$\"}[5m])) by (code)>0", + "format": 
"time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{code}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "API Server Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:157", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:158", + "decimals": 1, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 24, + "panels": [], + "title": "API Server Details", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": null, + "decimals": 0, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 23, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(apiserver_request_total[5m])) by 
(verb)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{verb}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "API Server Requests (by Verb)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 23, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(apiserver_request_total[5m])) by (resource)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{resource}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "API Requests (by 
Resource)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "decimals": 0, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 23, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(apiserver_request_total{client!~\"kubelet.*|kube-scheduler.*|kube-controller.*|kube-apiserver.*|kube-proxy.*\"}[5m])) by (client)>0", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{client}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "API Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": 
[] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 23, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le, resource) )", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{resource}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Latency by resource", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:252", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:253", + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 23, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 49, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le, verb) )", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{resource}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Latency by verb", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:194", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:195", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + 
"cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": null, + "decimals": 0, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 23, + "x": 0, + "y": 46 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "aggregation": "Last", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "exemplar": true, + "expr": "sum(rate(apiserver_request_total[5m])) by (instance)>0", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}} ", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "API Server Requests by server", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:223", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:224", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + 
"refresh": "30s", + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "K8S Control Plane", + "uid": "k8scontrolplane", + "version": 2 + } diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml.j2 similarity index 92% rename from manager/manifests/grafana/grafana.yaml rename to manager/manifests/grafana/grafana.yaml.j2 index b7299b25d1..ec863d3fb5 100644 --- a/manager/manifests/grafana/grafana.yaml +++ b/manager/manifests/grafana/grafana.yaml.j2 @@ -96,7 +96,7 @@ spec: app: grafana spec: containers: - - image: $CORTEX_IMAGE_GRAFANA + - image: {{ config['image_grafana'] }} name: grafana ports: - containerPort: 3000 @@ -147,6 +147,11 @@ spec: - mountPath: /grafana-dashboard-definitions/cortex/nodes name: grafana-dashboard-nodes readOnly: false + {% if env.get("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD") == "true" %} + - mountPath: /grafana-dashboard-definitions/cortex/control-plane + name: grafana-dashboard-control-plane + readOnly: false + {% endif %} securityContext: fsGroup: 65534 runAsNonRoot: true @@ -179,6 +184,11 @@ spec: - name: grafana-dashboard-nodes configMap: name: grafana-dashboard-nodes + {% if env.get("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD") == "true" %} + - name: grafana-dashboard-control-plane + configMap: + name: grafana-dashboard-control-plane + {% endif %} nodeSelector: prometheus: "true" tolerations: diff --git a/manager/manifests/prometheus-additional-scrape-configs.yaml.j2 b/manager/manifests/prometheus-additional-scrape-configs.yaml.j2 new file mode 100644 index 0000000000..fa7718151b --- /dev/null +++ 
b/manager/manifests/prometheus-additional-scrape-configs.yaml.j2 @@ -0,0 +1,54 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{% if env.get("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD") == "true" %} +- job_name: "kubernetes-apiservers" + kubernetes_sd_configs: + - role: endpoints + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & authorization config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # <kubernetes_sd_config>. + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + # insecure_skip_verify: true + authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. 
This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: + [ + __meta_kubernetes_namespace, + __meta_kubernetes_service_name, + __meta_kubernetes_endpoint_port_name, + ] + action: keep + regex: default;kubernetes;https +{% endif %} diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml similarity index 98% rename from manager/manifests/prometheus-monitoring.yaml.j2 rename to manager/manifests/prometheus-monitoring.yaml index b0d76fbc34..0982504aff 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml @@ -28,7 +28,7 @@ kind: Prometheus metadata: name: prometheus spec: - image: {{ config['image_prometheus'] }} + image: $CORTEX_IMAGE_PROMETHEUS serviceAccountName: prometheus nodeSelector: prometheus: "true" @@ -58,6 +58,9 @@ spec: resources: requests: storage: 40Gi + additionalScrapeConfigs: + name: additional-scrape-configs + key: prometheus-additional-scrape-configs.yaml retention: 2w retentionSize: 35GB securityContext: diff --git a/manager/uninstall.sh b/manager/uninstall.sh index 56c7adcd12..5839d8c456 100755 --- a/manager/uninstall.sh +++ b/manager/uninstall.sh @@ -18,8 +18,6 @@ set -e EKSCTL_TIMEOUT=45m -arg1="$1" - function main() { echo aws eks --region $CORTEX_REGION update-kubeconfig --name $CORTEX_CLUSTER_NAME >/dev/null @@ -27,17 +25,4 @@ function main() { echo -e "\n✓ done spinning down the cluster" } -function uninstall_prometheus() { - kubectl get configmap cluster-config -o jsonpath='{.data.cluster\.yaml}' > ./cluster.yaml - - # delete resources to detach disk - python render_template.py ./cluster.yaml manifests/prometheus-monitoring.yaml.j2 | kubectl delete -f - >/dev/null - kubectl delete pvc --namespace default prometheus-prometheus-db-prometheus-prometheus-0 >/dev/null -} - -function uninstall_grafana() { - kubectl delete statefulset 
--namespace default grafana >/dev/null - kubectl delete pvc --namespace default grafana-storage >/dev/null -} - main diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index 2fa5740ae0..db0d2f3b08 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -35,6 +35,7 @@ import ( cr "github.com/cortexlabs/cortex/pkg/lib/configreader" "github.com/cortexlabs/cortex/pkg/lib/errors" libhash "github.com/cortexlabs/cortex/pkg/lib/hash" + "github.com/cortexlabs/cortex/pkg/lib/k8s" libmath "github.com/cortexlabs/cortex/pkg/lib/math" "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" @@ -45,8 +46,12 @@ import ( ) const ( - // MaxNodePoolsOrGroups represents the max number of node groups in a cluster - MaxNodePoolsOrGroups = 100 + // MaxNodeGroups represents the max number of node groups in a cluster + MaxNodeGroups = 100 + // MaxNodesToAddOnClusterUp represents the max number of nodes to add on cluster up + MaxNodesToAddOnClusterUp = 250 + // MaxNodesToAddOnClusterConfigure represents the max number of nodes to add on cluster configure + MaxNodesToAddOnClusterConfigure = 100 // ClusterNameTag is the tag used for storing a cluster's name in AWS resources ClusterNameTag = "cortex.dev/cluster-name" // SQSQueueDelimiter is the delimiter character used for naming cortex SQS queues (e.g. 
cx__b__) @@ -929,8 +934,8 @@ func (cc *CoreConfig) SQSNamePrefix() string { func (cc *Config) validate(awsClient *aws.Client) error { numNodeGroups := len(cc.NodeGroups) - if numNodeGroups > MaxNodePoolsOrGroups { - return ErrorMaxNumOfNodeGroupsReached(MaxNodePoolsOrGroups) + if numNodeGroups > MaxNodeGroups { + return ErrorMaxNumOfNodeGroupsReached(MaxNodeGroups) } ngNames := []string{} @@ -1147,6 +1152,21 @@ func (cc *Config) validateSharedNodeGroupsDiff(oldConfig Config) error { return nil } +func (cc *Config) validateNodeAdditionRate(k8sClient *k8s.Client) error { + workloadNodes, err := k8sClient.ListNodesByLabel("workload", "true") + if err != nil { + return err + } + totalCurrentNodes := int64(len(workloadNodes)) + totalRequestedNodes := getTotalMinInstances(cc.NodeGroups) + + if totalRequestedNodes-totalCurrentNodes > MaxNodesToAddOnClusterConfigure { + return ErrorMaxNodesToAddOnClusterConfigure(totalRequestedNodes, totalCurrentNodes, MaxNodesToAddOnClusterConfigure) + } + + return nil +} + // this validates the user-provided cluster config func (cc *Config) ValidateOnInstall(awsClient *aws.Client) error { fmt.Print("verifying your configuration ...\n\n") @@ -1156,6 +1176,11 @@ func (cc *Config) ValidateOnInstall(awsClient *aws.Client) error { return err } + requestedTotalMinInstances := getTotalMinInstances(cc.NodeGroups) + if requestedTotalMinInstances > MaxNodesToAddOnClusterUp { + return errors.Wrap(ErrorMaxNodesToAddOnClusterUp(requestedTotalMinInstances, MaxNodesToAddOnClusterUp), NodeGroupsKey) + } + // setting max_instances to 0 during cluster creation is not permitted (but scaling max_instances to 0 afterwards is allowed) for _, nodeGroup := range cc.NodeGroups { if nodeGroup != nil && nodeGroup.MaxInstances == 0 { @@ -1183,7 +1208,7 @@ func (cc *Config) ValidateOnInstall(awsClient *aws.Client) error { return nil } -func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, oldConfig Config, eksNodeGroupStacks []*cloudformation.StackSummary) 
(ConfigureChanges, error) { +func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, k8sClient *k8s.Client, oldConfig Config, eksNodeGroupStacks []*cloudformation.StackSummary) (ConfigureChanges, error) { fmt.Print("verifying your configuration ...\n\n") cc.ClusterUID = oldConfig.ClusterUID @@ -1202,6 +1227,11 @@ func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, oldConfig Config, e return ConfigureChanges{}, err } + err = cc.validateNodeAdditionRate(k8sClient) + if err != nil { + return ConfigureChanges{}, errors.Wrap(err, NodeGroupsKey) + } + ngsToBeAdded := cc.getNewNodeGroups(oldConfig) ngsToBeRemoved := cc.getRemovedNodeGroups(oldConfig) @@ -1400,6 +1430,14 @@ func (cc *Config) getCommonNodeGroups(oldConfig Config) ([]*NodeGroup, []*NodeGr return commonNewNodeGroups, commonOldNodeGroups } +func getTotalMinInstances(nodeGroups []*NodeGroup) int64 { + totalMinInstances := int64(0) + for _, ng := range nodeGroups { + totalMinInstances += ng.MinInstances + } + return totalMinInstances +} + func GetNodeGroupNames(nodeGroups []*NodeGroup) []string { ngNames := make([]string, len(nodeGroups)) for i := range nodeGroups { diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 56d1be5115..5aee00df2b 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -35,6 +35,8 @@ const ( ErrNodeGroupMaxInstancesIsZero = "clusterconfig.node_group_max_instances_is_zero" ErrMaxNumOfNodeGroupsReached = "clusterconfig.max_num_of_nodegroups_reached" ErrDuplicateNodeGroupName = "clusterconfig.duplicate_nodegroup_name" + ErrMaxNodesToAddOnClusterUp = "clusterconfig.max_nodes_to_add_on_cluster_up" + ErrMaxNodesToAddOnClusterConfigure = "clusterconfig.max_nodes_to_add_on_cluster_configure" ErrInstanceTypeTooSmall = "clusterconfig.instance_type_too_small" ErrMinInstancesGreaterThanMax = "clusterconfig.min_instances_greater_than_max" ErrInstanceTypeNotSupportedInRegion = 
"clusterconfig.instance_type_not_supported_in_region" @@ -127,6 +129,20 @@ func ErrorDuplicateNodeGroupName(duplicateNgName string) error { }) } +func ErrorMaxNodesToAddOnClusterUp(requestedNodes, maxNodes int64) error { + return errors.WithStack(&errors.Error{ + Kind: ErrMaxNodesToAddOnClusterUp, + Message: fmt.Sprintf("cannot create a cluster with %d instances (at most %d instances can be created initially); reduce %s for your nodegroups (you may add additional instances via the `cortex cluster configure` command after your cluster has been created)", requestedNodes, maxNodes, MinInstancesKey), + }) +} + +func ErrorMaxNodesToAddOnClusterConfigure(requestedNodes, currentNodes, maxNodes int64) error { + return errors.WithStack(&errors.Error{ + Kind: ErrMaxNodesToAddOnClusterConfigure, + Message: fmt.Sprintf("cannot add %d instances to your cluster (you requested %d total instances, but your cluster currently has %d instances); only %d instances can be added at time, so reduce the sum of %s across all nodegroups by %d", requestedNodes-currentNodes, requestedNodes, currentNodes, maxNodes, MinInstancesKey, requestedNodes-currentNodes-maxNodes), + }) +} + func ErrorInstanceTypeTooSmall(instanceType string) error { return errors.WithStack(&errors.Error{ Kind: ErrInstanceTypeTooSmall,