diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b5613d4fb6..bc070614df 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -166,6 +166,9 @@ Add this to your bash profile (e.g. `~/.bash_profile`, `~/.profile` or `~/.bashr
 # set the default image registry
 export CORTEX_DEV_DEFAULT_IMAGE_REGISTRY="<account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs"
 
+# enable api server monitoring in grafana
+export CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD="true"
+
 # redirect analytics and error reporting to our dev environment
 export CORTEX_TELEMETRY_SENTRY_DSN="https://c334df915c014ffa93f2076769e5b334@sentry.io/1848098"
 export CORTEX_TELEMETRY_SEGMENT_WRITE_KEY="0WvoJyCey9z1W2EW7rYTPJUMRYat46dl"
diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index 7d2a24b014..2b077849e8 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -348,6 +348,21 @@ var _clusterConfigureCmd = &cobra.Command{
 			exit.Error(err)
 		}
 
+		restConfig, err := getClusterRESTConfig(awsClient, accessConfig.ClusterName)
+		if err != nil {
+			exit.Error(err)
+		}
+
+		scheme := runtime.NewScheme()
+		if err := clientgoscheme.AddToScheme(scheme); err != nil {
+			exit.Error(err)
+		}
+
+		k8sClient, err := k8s.New("default", false, restConfig, scheme)
+		if err != nil {
+			exit.Error(err)
+		}
+
 		stacks, err := clusterstate.GetClusterStacks(awsClient, accessConfig)
 		if err != nil {
 			exit.Error(err)
@@ -362,7 +377,7 @@ var _clusterConfigureCmd = &cobra.Command{
 
 		promptIfNotAdmin(awsClient, _flagClusterDisallowPrompt)
 
-		newClusterConfig, configureChanges, err := getConfigureClusterConfig(awsClient, stacks, oldClusterConfig, clusterConfigFile)
+		newClusterConfig, configureChanges, err := getConfigureClusterConfig(awsClient, k8sClient, stacks, oldClusterConfig, clusterConfigFile)
 		if err != nil {
 			exit.Error(err)
 		}
diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go
index b6c652fc8b..3ceeca2615 100644
--- a/cli/cmd/lib_cluster_config.go
+++ b/cli/cmd/lib_cluster_config.go
@@ -27,6 +27,7 @@ import (
 	cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
 	"github.com/cortexlabs/cortex/pkg/lib/errors"
 	"github.com/cortexlabs/cortex/pkg/lib/files"
+	"github.com/cortexlabs/cortex/pkg/lib/k8s"
 	"github.com/cortexlabs/cortex/pkg/lib/maps"
 	libmath "github.com/cortexlabs/cortex/pkg/lib/math"
 	"github.com/cortexlabs/cortex/pkg/lib/pointer"
@@ -140,7 +141,7 @@ func getInstallClusterConfig(awsClient *aws.Client, clusterConfigFile string) (*
 	return clusterConfig, nil
 }
 
-func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.ClusterStacks, cachedClusterConfig clusterconfig.Config, newClusterConfigFile string) (*clusterconfig.Config, clusterconfig.ConfigureChanges, error) {
+func getConfigureClusterConfig(awsClient *aws.Client, k8sClient *k8s.Client, stacks clusterstate.ClusterStacks, cachedClusterConfig clusterconfig.Config, newClusterConfigFile string) (*clusterconfig.Config, clusterconfig.ConfigureChanges, error) {
 	newUserClusterConfig := &clusterconfig.Config{}
 
 	err := readUserClusterConfigFile(newUserClusterConfig, newClusterConfigFile)
@@ -151,7 +152,7 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
 	newUserClusterConfig.Telemetry = isTelemetryEnabled()
 	cachedClusterConfig.Telemetry = newUserClusterConfig.Telemetry
 
-	configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, cachedClusterConfig, stacks.NodeGroupsStacks)
+	configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, k8sClient, cachedClusterConfig, stacks.NodeGroupsStacks)
 	if err != nil {
 		err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor))
 		return nil, clusterconfig.ConfigureChanges{}, errors.Wrap(err, newClusterConfigFile)
diff --git a/cli/cmd/lib_manager.go b/cli/cmd/lib_manager.go
index edacce2111..ea2151d248 100644
--- a/cli/cmd/lib_manager.go
+++ b/cli/cmd/lib_manager.go
@@ -191,6 +191,7 @@ func runManagerWithClusterConfig(entrypoint string, clusterConfig *clusterconfig
 		"CORTEX_TELEMETRY_SENTRY_DSN=" + os.Getenv("CORTEX_TELEMETRY_SENTRY_DSN"),
 		"CORTEX_TELEMETRY_SEGMENT_WRITE_KEY=" + os.Getenv("CORTEX_TELEMETRY_SEGMENT_WRITE_KEY"),
 		"CORTEX_DEV_DEFAULT_IMAGE_REGISTRY=" + os.Getenv("CORTEX_DEV_DEFAULT_IMAGE_REGISTRY"),
+		"CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD=" + os.Getenv("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD"),
 		"CORTEX_CLUSTER_CONFIG_FILE=" + containerClusterConfigPath,
 	}
 	envs = append(envs, extraEnvs...)
diff --git a/dev/versions.md b/dev/versions.md
index ce204f9b6e..853702188a 100644
--- a/dev/versions.md
+++ b/dev/versions.md
@@ -177,11 +177,14 @@ see https://github.com/moby/moby/issues/39302#issuecomment-639687466_
 
 1. Find the latest patch release for our current version of k8s (e.g. k8s v1.17 -> cluster-autocluster v1.17.3)
    on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog
-1. Update the base image in `images/cluster-autoscaler/Dockerfile` to the repository URL shown in the GitHub release
 1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws),
    set the tree to the tag for the chosen release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml`
    (e.g. <https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.20.0/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml>)
-1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`
+1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`.
+1. Pull the release branch from the upstream repo into Cortex's fork on [GitHub](https://github.com/cortexlabs/autoscaler).
+1. Apply the rate-limiter changes from the previous version to the new one (they currently live on the `cluster-autoscaler-release-1.20` branch).
+1. Update the value of the `-b` flag in the `git clone` command in `images/cluster-autoscaler/Dockerfile` to the branch name of the latest release in Cortex's fork.
+1. Match the Go version of the builder image in `images/cluster-autoscaler/Dockerfile` with the one used by the upstream [cluster autoscaler](https://github.com/kubernetes/autoscaler) Dockerfile.
 
 ## FluentBit
 
diff --git a/images/cluster-autoscaler/Dockerfile b/images/cluster-autoscaler/Dockerfile
index 4d7b17cb71..5b199625b2 100644
--- a/images/cluster-autoscaler/Dockerfile
+++ b/images/cluster-autoscaler/Dockerfile
@@ -1 +1,15 @@
-FROM k8s.gcr.io/autoscaling/cluster-autoscaler:v1.20.0
+# Build the cluster autoscaler from Cortex's fork (branch carrying the scale-up rate-limiter changes).
+FROM golang:1.15 AS builder
+# TARGETOS/TARGETARCH must be declared inside the stage to be visible to RUN;
+# an ARG placed before the first FROM is only usable in FROM lines.
+ARG TARGETOS
+ARG TARGETARCH
+RUN git clone -b cluster-autoscaler-release-1.20 --depth 1 https://github.com/cortexlabs/autoscaler /k8s.io/autoscaler
+WORKDIR /k8s.io/autoscaler/cluster-autoscaler
+RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build --installsuffix cgo -o cluster-autoscaler k8s.io/autoscaler/cluster-autoscaler \
+    && cp cluster-autoscaler /usr/local/bin
+
+# Runtime image: CA certificates plus the binary copied from the builder stage.
+FROM alpine:3.8
+RUN apk add -U --no-cache ca-certificates && rm -rf /var/cache/apk/*
+COPY --from=builder /usr/local/bin/cluster-autoscaler .
diff --git a/manager/install.sh b/manager/install.sh
index c7d6cd99bd..a4b4914401 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -225,7 +225,9 @@ function setup_prometheus() {
   envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null
   envsubst < manifests/prometheus-kube-state-metrics.yaml | kubectl apply -f - >/dev/null
   envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null
-  python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null
+  envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null
+  python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml
+  kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml --dry-run=client -o yaml | kubectl apply -f - >/dev/null  # create-or-update so a rerun does not fail on an existing secret
 }
 
 function setup_grafana() {
@@ -235,7 +237,10 @@ function setup_grafana() {
   kubectl apply -f manifests/grafana/grafana-dashboard-task.yaml >/dev/null
   kubectl apply -f manifests/grafana/grafana-dashboard-cluster.yaml >/dev/null
   kubectl apply -f manifests/grafana/grafana-dashboard-nodes.yaml >/dev/null
-  envsubst < manifests/grafana/grafana.yaml | kubectl apply -f - >/dev/null
+  if [ "$CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD" = "true" ]; then
+    kubectl apply -f manifests/grafana/grafana-dashboard-control-plane.yaml >/dev/null
+  fi
+  python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/grafana/grafana.yaml.j2 | kubectl apply -f - >/dev/null
 }
 
 function restart_operator() {
diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2
index 0fd3bb918e..d638e140eb 100644
--- a/manager/manifests/cluster-autoscaler.yaml.j2
+++ b/manager/manifests/cluster-autoscaler.yaml.j2
@@ -197,6 +197,9 @@ spec:
             - --ok-total-unready-count=30
             - --max-node-provision-time=8m
             - --scan-interval=20s
+            - --scale-up-rate-limit-enabled=true
+            - --scale-up-max-number-nodes-per-min=50
+            - --scale-up-burst-number-nodes-per-min=75
             - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ config['cluster_name'] }}
           volumeMounts:
             - name: ssl-certs
diff --git a/manager/manifests/grafana/grafana-dashboard-control-plane.yaml b/manager/manifests/grafana/grafana-dashboard-control-plane.yaml
new file mode 100644
index 0000000000..3a0f26f196
--- /dev/null
+++ b/manager/manifests/grafana/grafana-dashboard-control-plane.yaml
@@ -0,0 +1,1234 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-control-plane
+  namespace: default
+data:
+  control-plane.json: |
+    {
+      "annotations": {
+        "list": [
+          {
+            "builtIn": 1,
+            "enable": true,
+            "hide": true,
+            "iconColor": "rgba(0, 211, 255, 1)",
+            "name": "Annotations & Alerts",
+            "type": "dashboard"
+          }
+        ]
+      },
+      "description": "Metrics for API Server, Controller and Scheduler.",
+      "editable": true,
+      "gnetId": 10907,
+      "graphTooltip": 0,
+      "links": [],
+      "panels": [
+        {
+          "collapsed": false,
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 0
+          },
+          "id": 18,
+          "panels": [],
+          "title": "API Server Summary",
+          "type": "row"
+        },
+        {
+          "cacheTimeout": null,
+          "datasource": null,
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "decimals": null,
+              "mappings": [],
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 3,
+            "x": 0,
+            "y": 1
+          },
+          "id": 6,
+          "links": [],
+          "options": {
+            "orientation": "auto",
+            "reduceOptions": {
+              "calcs": [
+                "last"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "showThresholdLabels": false,
+            "showThresholdMarkers": true,
+            "text": {}
+          },
+          "pluginVersion": "8.0.4",
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "sum(rate(apiserver_request_total[5m]))",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{verb}}",
+              "refId": "A"
+            }
+          ],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "API Server RPS",
+          "type": "gauge"
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 7,
+            "w": 5,
+            "x": 3,
+            "y": 1
+          },
+          "hiddenSeries": false,
+          "id": 47,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(apiserver_current_inflight_requests) by (requestKind)",
+              "legendFormat": "{{requestKind}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Current Inflight Requests",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 7,
+            "w": 5,
+            "x": 8,
+            "y": 1
+          },
+          "hiddenSeries": false,
+          "id": 32,
+          "legend": {
+            "alignAsTable": false,
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "rightSide": false,
+            "show": false,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le) )",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Latency - p99",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:45",
+              "decimals": 2,
+              "format": "s",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "$$hashKey": "object:46",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "cacheTimeout": null,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "decimals": 2,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 7,
+            "w": 5,
+            "x": 13,
+            "y": 1
+          },
+          "hiddenSeries": false,
+          "id": 21,
+          "legend": {
+            "alignAsTable": false,
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "rightSide": false,
+            "show": true,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "sum(rate(apiserver_request_duration_seconds_sum{verb!~\"WATCH|CONNECT\"}[5m])) ",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "seconds",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "API Server Response Duration (Seconds)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:101",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "$$hashKey": "object:102",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "cacheTimeout": null,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "decimals": 0,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 7,
+            "w": 5,
+            "x": 18,
+            "y": 1
+          },
+          "hiddenSeries": false,
+          "id": 15,
+          "legend": {
+            "alignAsTable": false,
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "rightSide": false,
+            "show": true,
+            "sort": null,
+            "sortDesc": null,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "sum(rate(apiserver_request_total{code=~\"^(?:4..)$|^(?:5..)$\"}[5m])) by (code)>0",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{code}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "API Server Errors",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:157",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "$$hashKey": "object:158",
+              "decimals": 1,
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "collapsed": false,
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 8
+          },
+          "id": 24,
+          "panels": [],
+          "title": "API Server Details",
+          "type": "row"
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "cacheTimeout": null,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "decimals": 0,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 7,
+            "w": 23,
+            "x": 0,
+            "y": 9
+          },
+          "hiddenSeries": false,
+          "id": 12,
+          "legend": {
+            "alignAsTable": true,
+            "avg": false,
+            "current": true,
+            "max": false,
+            "min": false,
+            "rightSide": true,
+            "show": true,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "sum(rate(apiserver_request_total[5m])) by (verb)",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{verb}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "API Server Requests (by Verb)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 7,
+            "w": 23,
+            "x": 0,
+            "y": 16
+          },
+          "hiddenSeries": false,
+          "id": 8,
+          "legend": {
+            "alignAsTable": true,
+            "avg": false,
+            "current": true,
+            "max": false,
+            "min": false,
+            "rightSide": true,
+            "show": true,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "sum(rate(apiserver_request_total[5m])) by (resource)",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{resource}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "API Requests (by Resource)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "decimals": 0,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 7,
+            "w": 23,
+            "x": 0,
+            "y": 23
+          },
+          "hiddenSeries": false,
+          "id": 9,
+          "legend": {
+            "alignAsTable": true,
+            "avg": false,
+            "current": true,
+            "max": false,
+            "min": false,
+            "rightSide": true,
+            "show": true,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "sum(rate(apiserver_request_total{client!~\"kubelet.*|kube-scheduler.*|kube-controller.*|kube-apiserver.*|kube-proxy.*\"}[5m])) by (client)>0",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{client}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "API Requests",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 23,
+            "x": 0,
+            "y": 30
+          },
+          "hiddenSeries": false,
+          "id": 2,
+          "legend": {
+            "alignAsTable": true,
+            "avg": false,
+            "current": true,
+            "max": false,
+            "min": false,
+            "rightSide": true,
+            "show": true,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "histogram_quantile(0.90, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le, resource) )",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{resource}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Latency by resource",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:252",
+              "format": "s",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "$$hashKey": "object:253",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 23,
+            "x": 0,
+            "y": 38
+          },
+          "hiddenSeries": false,
+          "id": 49,
+          "legend": {
+            "alignAsTable": true,
+            "avg": false,
+            "current": true,
+            "max": false,
+            "min": false,
+            "rightSide": true,
+            "show": true,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "histogram_quantile(0.90, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le, verb) )",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{verb}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Latency by verb",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:194",
+              "format": "s",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "$$hashKey": "object:195",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "cacheTimeout": null,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "decimals": 0,
+          "fieldConfig": {
+            "defaults": {
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 23,
+            "x": 0,
+            "y": 46
+          },
+          "hiddenSeries": false,
+          "id": 10,
+          "legend": {
+            "alignAsTable": true,
+            "avg": false,
+            "current": true,
+            "max": false,
+            "min": false,
+            "rightSide": true,
+            "show": true,
+            "sort": "current",
+            "sortDesc": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "8.0.4",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "aggregation": "Last",
+              "decimals": 2,
+              "displayAliasType": "Warning / Critical",
+              "displayType": "Regular",
+              "displayValueWithAlias": "Never",
+              "exemplar": true,
+              "expr": "sum(rate(apiserver_request_total[5m])) by (instance)>0",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{instance}} ",
+              "refId": "A",
+              "units": "none",
+              "valueHandler": "Number Threshold"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "API Server Requests by server",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:223",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "$$hashKey": "object:224",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 30,
+      "style": "dark",
+      "tags": [],
+      "templating": {
+        "list": []
+      },
+      "time": {
+        "from": "now-1h",
+        "to": "now"
+      },
+      "timepicker": {
+        "refresh_intervals": [
+          "5s",
+          "10s",
+          "30s",
+          "1m",
+          "5m",
+          "15m",
+          "30m",
+          "1h",
+          "2h",
+          "1d"
+        ],
+        "time_options": [
+          "5m",
+          "15m",
+          "1h",
+          "6h",
+          "12h",
+          "24h",
+          "2d",
+          "7d",
+          "30d"
+        ]
+      },
+      "timezone": "",
+      "title": "K8S Control Plane",
+      "uid": "k8scontrolplane",
+      "version": 2
+    }
diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml.j2
similarity index 92%
rename from manager/manifests/grafana/grafana.yaml
rename to manager/manifests/grafana/grafana.yaml.j2
index b7299b25d1..ec863d3fb5 100644
--- a/manager/manifests/grafana/grafana.yaml
+++ b/manager/manifests/grafana/grafana.yaml.j2
@@ -96,7 +96,7 @@ spec:
         app: grafana
     spec:
       containers:
-        - image: $CORTEX_IMAGE_GRAFANA
+        - image: {{ config['image_grafana'] }}
           name: grafana
           ports:
             - containerPort: 3000
@@ -147,6 +147,11 @@ spec:
             - mountPath: /grafana-dashboard-definitions/cortex/nodes
               name: grafana-dashboard-nodes
               readOnly: false
+            {% if env.get("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD") == "true" %}
+            - mountPath: /grafana-dashboard-definitions/cortex/control-plane
+              name: grafana-dashboard-control-plane
+              readOnly: false
+            {% endif %}
       securityContext:
         fsGroup: 65534
         runAsNonRoot: true
@@ -179,6 +184,11 @@ spec:
         - name: grafana-dashboard-nodes
           configMap:
             name: grafana-dashboard-nodes
+        {% if env.get("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD") == "true" %}
+        - name: grafana-dashboard-control-plane
+          configMap:
+            name: grafana-dashboard-control-plane
+        {% endif %}
       nodeSelector:
         prometheus: "true"
       tolerations:
diff --git a/manager/manifests/prometheus-additional-scrape-configs.yaml.j2 b/manager/manifests/prometheus-additional-scrape-configs.yaml.j2
new file mode 100644
index 0000000000..fa7718151b
--- /dev/null
+++ b/manager/manifests/prometheus-additional-scrape-configs.yaml.j2
@@ -0,0 +1,54 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+{% if env.get("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD") == "true" %}
+- job_name: "kubernetes-apiservers"
+  kubernetes_sd_configs:
+    - role: endpoints
+
+  # Default to scraping over https. If required, just disable this or change to
+  # `http`.
+  scheme: https
+
+  # This TLS & authorization config is used to connect to the actual scrape
+  # endpoints for cluster components. This is separate to discovery auth
+  # configuration because discovery & scraping are two separate concerns in
+  # Prometheus. The discovery auth config is automatic if Prometheus runs inside
+  # the cluster. Otherwise, more config options have to be provided within the
+  # <kubernetes_sd_config>.
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    # If your node certificates are self-signed or use a different CA to the
+    # master CA, then disable certificate verification below. Note that
+    # certificate verification is an integral part of a secure infrastructure
+    # so this should only be disabled in a controlled environment. You can
+    # disable certificate verification by uncommenting the line below.
+    #
+    # insecure_skip_verify: true
+  authorization:
+    credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+  # Keep only the default/kubernetes service endpoints for the https port. This
+  # will add targets for each API server which Kubernetes adds an endpoint to
+  # the default/kubernetes service.
+  relabel_configs:
+    - source_labels:
+        [
+          __meta_kubernetes_namespace,
+          __meta_kubernetes_service_name,
+          __meta_kubernetes_endpoint_port_name,
+        ]
+      action: keep
+      regex: default;kubernetes;https
+{% endif %}
diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml
similarity index 98%
rename from manager/manifests/prometheus-monitoring.yaml.j2
rename to manager/manifests/prometheus-monitoring.yaml
index b0d76fbc34..0982504aff 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml
@@ -28,7 +28,7 @@ kind: Prometheus
 metadata:
   name: prometheus
 spec:
-  image: {{ config['image_prometheus'] }}
+  image: $CORTEX_IMAGE_PROMETHEUS
   serviceAccountName: prometheus
   nodeSelector:
     prometheus: "true"
@@ -58,6 +58,9 @@ spec:
         resources:
           requests:
             storage: 40Gi
+  additionalScrapeConfigs:
+    name: additional-scrape-configs
+    key: prometheus-additional-scrape-configs.yaml
   retention: 2w
   retentionSize: 35GB
   securityContext:
diff --git a/manager/uninstall.sh b/manager/uninstall.sh
index 56c7adcd12..5839d8c456 100755
--- a/manager/uninstall.sh
+++ b/manager/uninstall.sh
@@ -18,8 +18,6 @@ set -e
 
 EKSCTL_TIMEOUT=45m
 
-arg1="$1"
-
 function main() {
   echo
   aws eks --region $CORTEX_REGION update-kubeconfig --name $CORTEX_CLUSTER_NAME >/dev/null
@@ -27,17 +25,4 @@ function main() {
   echo -e "\n✓ done spinning down the cluster"
 }
 
-function uninstall_prometheus() {
-  kubectl get configmap cluster-config -o jsonpath='{.data.cluster\.yaml}' > ./cluster.yaml
-
-  # delete resources to detach disk
-  python render_template.py ./cluster.yaml manifests/prometheus-monitoring.yaml.j2 | kubectl delete -f - >/dev/null
-  kubectl delete pvc --namespace default prometheus-prometheus-db-prometheus-prometheus-0 >/dev/null
-}
-
-function uninstall_grafana() {
-  kubectl delete statefulset --namespace default grafana >/dev/null
-  kubectl delete pvc --namespace default grafana-storage >/dev/null
-}
-
 main
diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go
index 2fa5740ae0..db0d2f3b08 100644
--- a/pkg/types/clusterconfig/cluster_config.go
+++ b/pkg/types/clusterconfig/cluster_config.go
@@ -35,6 +35,7 @@ import (
 	cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
 	"github.com/cortexlabs/cortex/pkg/lib/errors"
 	libhash "github.com/cortexlabs/cortex/pkg/lib/hash"
+	"github.com/cortexlabs/cortex/pkg/lib/k8s"
 	libmath "github.com/cortexlabs/cortex/pkg/lib/math"
 	"github.com/cortexlabs/cortex/pkg/lib/pointer"
 	"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
@@ -45,8 +46,12 @@ import (
 )
 
 const (
-	// MaxNodePoolsOrGroups represents the max number of node groups in a cluster
-	MaxNodePoolsOrGroups = 100
+	// MaxNodeGroups represents the max number of node groups in a cluster
+	MaxNodeGroups = 100
+	// MaxNodesToAddOnClusterUp represents the max number of nodes to add on cluster up
+	MaxNodesToAddOnClusterUp = 250
+	// MaxNodesToAddOnClusterConfigure represents the max number of nodes to add on cluster configure
+	MaxNodesToAddOnClusterConfigure = 100
 	// ClusterNameTag is the tag used for storing a cluster's name in AWS resources
 	ClusterNameTag = "cortex.dev/cluster-name"
 	// SQSQueueDelimiter is the delimiter character used for naming cortex SQS queues (e.g. cx_<cluster_hash>_b_<api_name>_<jon_id>)
@@ -929,8 +934,8 @@ func (cc *CoreConfig) SQSNamePrefix() string {
 
 func (cc *Config) validate(awsClient *aws.Client) error {
 	numNodeGroups := len(cc.NodeGroups)
-	if numNodeGroups > MaxNodePoolsOrGroups {
-		return ErrorMaxNumOfNodeGroupsReached(MaxNodePoolsOrGroups)
+	if numNodeGroups > MaxNodeGroups {
+		return ErrorMaxNumOfNodeGroupsReached(MaxNodeGroups)
 	}
 
 	ngNames := []string{}
@@ -1147,6 +1152,21 @@ func (cc *Config) validateSharedNodeGroupsDiff(oldConfig Config) error {
 	return nil
 }
 
+// validateNodeAdditionRate rejects a config whose total min instances exceed the count of nodes labeled workload=true by more than MaxNodesToAddOnClusterConfigure.
+func (cc *Config) validateNodeAdditionRate(k8sClient *k8s.Client) error {
+	nodes, err := k8sClient.ListNodesByLabel("workload", "true")
+	if err != nil {
+		return err
+	}
+
+	current, requested := int64(len(nodes)), getTotalMinInstances(cc.NodeGroups)
+	if toAdd := requested - current; toAdd > MaxNodesToAddOnClusterConfigure {
+		return ErrorMaxNodesToAddOnClusterConfigure(requested, current, MaxNodesToAddOnClusterConfigure)
+	}
+
+	return nil
+}
+
 // this validates the user-provided cluster config
 func (cc *Config) ValidateOnInstall(awsClient *aws.Client) error {
 	fmt.Print("verifying your configuration ...\n\n")
@@ -1156,6 +1176,11 @@ func (cc *Config) ValidateOnInstall(awsClient *aws.Client) error {
 		return err
 	}
 
+	requestedTotalMinInstances := getTotalMinInstances(cc.NodeGroups)
+	if requestedTotalMinInstances > MaxNodesToAddOnClusterUp {
+		return errors.Wrap(ErrorMaxNodesToAddOnClusterUp(requestedTotalMinInstances, MaxNodesToAddOnClusterUp), NodeGroupsKey)
+	}
+
 	// setting max_instances to 0 during cluster creation is not permitted (but scaling max_instances to 0 afterwards is allowed)
 	for _, nodeGroup := range cc.NodeGroups {
 		if nodeGroup != nil && nodeGroup.MaxInstances == 0 {
@@ -1183,7 +1208,7 @@ func (cc *Config) ValidateOnInstall(awsClient *aws.Client) error {
 	return nil
 }
 
-func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, oldConfig Config, eksNodeGroupStacks []*cloudformation.StackSummary) (ConfigureChanges, error) {
+func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, k8sClient *k8s.Client, oldConfig Config, eksNodeGroupStacks []*cloudformation.StackSummary) (ConfigureChanges, error) {
 	fmt.Print("verifying your configuration ...\n\n")
 
 	cc.ClusterUID = oldConfig.ClusterUID
@@ -1202,6 +1227,11 @@ func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, oldConfig Config, e
 		return ConfigureChanges{}, err
 	}
 
+	err = cc.validateNodeAdditionRate(k8sClient)
+	if err != nil {
+		return ConfigureChanges{}, errors.Wrap(err, NodeGroupsKey)
+	}
+
 	ngsToBeAdded := cc.getNewNodeGroups(oldConfig)
 	ngsToBeRemoved := cc.getRemovedNodeGroups(oldConfig)
 
@@ -1400,6 +1430,19 @@ func (cc *Config) getCommonNodeGroups(oldConfig Config) ([]*NodeGroup, []*NodeGr
 	return commonNewNodeGroups, commonOldNodeGroups
 }
 
+// getTotalMinInstances returns the sum of MinInstances across nodeGroups.
+// Nil entries are skipped so that callers which have not yet filtered them out do not panic.
+func getTotalMinInstances(nodeGroups []*NodeGroup) int64 {
+	totalMinInstances := int64(0)
+	for _, ng := range nodeGroups {
+		if ng == nil {
+			continue
+		}
+		totalMinInstances += ng.MinInstances
+	}
+	return totalMinInstances
+}
+
 func GetNodeGroupNames(nodeGroups []*NodeGroup) []string {
 	ngNames := make([]string, len(nodeGroups))
 	for i := range nodeGroups {
diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go
index 56d1be5115..5aee00df2b 100644
--- a/pkg/types/clusterconfig/errors.go
+++ b/pkg/types/clusterconfig/errors.go
@@ -35,6 +35,8 @@ const (
 	ErrNodeGroupMaxInstancesIsZero            = "clusterconfig.node_group_max_instances_is_zero"
 	ErrMaxNumOfNodeGroupsReached              = "clusterconfig.max_num_of_nodegroups_reached"
 	ErrDuplicateNodeGroupName                 = "clusterconfig.duplicate_nodegroup_name"
+	ErrMaxNodesToAddOnClusterUp               = "clusterconfig.max_nodes_to_add_on_cluster_up"
+	ErrMaxNodesToAddOnClusterConfigure        = "clusterconfig.max_nodes_to_add_on_cluster_configure"
 	ErrInstanceTypeTooSmall                   = "clusterconfig.instance_type_too_small"
 	ErrMinInstancesGreaterThanMax             = "clusterconfig.min_instances_greater_than_max"
 	ErrInstanceTypeNotSupportedInRegion       = "clusterconfig.instance_type_not_supported_in_region"
@@ -127,6 +129,20 @@ func ErrorDuplicateNodeGroupName(duplicateNgName string) error {
 	})
 }
 
+// ErrorMaxNodesToAddOnClusterUp builds the error for a new cluster that requests requestedNodes initial instances when at most maxNodes are allowed.
+func ErrorMaxNodesToAddOnClusterUp(requestedNodes, maxNodes int64) error {
+	// the message names the config key to lower and points at `cortex cluster configure` for adding instances later
+	msg := fmt.Sprintf("cannot create a cluster with %d instances (at most %d instances can be created initially); reduce %s for your nodegroups (you may add additional instances via the `cortex cluster configure` command after your cluster has been created)", requestedNodes, maxNodes, MinInstancesKey)
+	return errors.WithStack(&errors.Error{Kind: ErrMaxNodesToAddOnClusterUp, Message: msg})
+}
+
+// ErrorMaxNodesToAddOnClusterConfigure builds the error for a configure request that would add more than maxNodes instances (requestedNodes minus currentNodes) at once.
+func ErrorMaxNodesToAddOnClusterConfigure(requestedNodes, currentNodes, maxNodes int64) error {
+	toAdd := requestedNodes - currentNodes // instances this request would add; toAdd-maxNodes is how far it is over the limit
+	msg := fmt.Sprintf("cannot add %d instances to your cluster (you requested %d total instances, but your cluster currently has %d instances); only %d instances can be added at a time, so reduce the sum of %s across all nodegroups by %d", toAdd, requestedNodes, currentNodes, maxNodes, MinInstancesKey, toAdd-maxNodes)
+	return errors.WithStack(&errors.Error{Kind: ErrMaxNodesToAddOnClusterConfigure, Message: msg})
+}
+
 func ErrorInstanceTypeTooSmall(instanceType string) error {
 	return errors.WithStack(&errors.Error{
 		Kind:    ErrInstanceTypeTooSmall,