Fix EKS control plane getting overloaded when adding nodes to the cluster #2331


Merged 20 commits on Jul 12, 2021
3 changes: 3 additions & 0 deletions CONTRIBUTING.md
@@ -166,6 +166,9 @@ Add this to your bash profile (e.g. `~/.bash_profile`, `~/.profile` or `~/.bashrc`)
# set the default image registry
export CORTEX_DEV_DEFAULT_IMAGE_REGISTRY="<account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs"

# enable api server monitoring in grafana
export CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD="true"

# redirect analytics and error reporting to our dev environment
export CORTEX_TELEMETRY_SENTRY_DSN="https://[email protected]/1848098"
export CORTEX_TELEMETRY_SEGMENT_WRITE_KEY="0WvoJyCey9z1W2EW7rYTPJUMRYat46dl"
17 changes: 16 additions & 1 deletion cli/cmd/cluster.go
@@ -348,6 +348,21 @@ var _clusterConfigureCmd = &cobra.Command{
exit.Error(err)
}

// build a client for the cluster's Kubernetes API so that configure-time validation can inspect live cluster state
restConfig, err := getClusterRESTConfig(awsClient, accessConfig.ClusterName)
if err != nil {
exit.Error(err)
}

scheme := runtime.NewScheme()
if err := clientgoscheme.AddToScheme(scheme); err != nil {
exit.Error(err)
}

k8sClient, err := k8s.New("default", false, restConfig, scheme)
if err != nil {
exit.Error(err)
}

stacks, err := clusterstate.GetClusterStacks(awsClient, accessConfig)
if err != nil {
exit.Error(err)
@@ -362,7 +377,7 @@ var _clusterConfigureCmd = &cobra.Command{

promptIfNotAdmin(awsClient, _flagClusterDisallowPrompt)

newClusterConfig, configureChanges, err := getConfigureClusterConfig(awsClient, k8sClient, stacks, oldClusterConfig, clusterConfigFile)
if err != nil {
exit.Error(err)
}
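The diff calls `getClusterRESTConfig`, which isn't shown. A plausible sketch of such a helper, assuming aws-sdk-go's EKS API and aws-iam-authenticator's token generator (Cortex's actual helper may differ), looks like:

```go
package cmd

import (
	"encoding/base64"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/eks"
	"k8s.io/client-go/rest"
	"sigs.k8s.io/aws-iam-authenticator/pkg/token"
)

// getClusterRESTConfigSketch (hypothetical) builds a client-go rest.Config for
// an EKS cluster: it looks up the API server endpoint and CA bundle, then
// mints a short-lived bearer token with aws-iam-authenticator.
func getClusterRESTConfigSketch(eksAPI *eks.EKS, clusterName string) (*rest.Config, error) {
	out, err := eksAPI.DescribeCluster(&eks.DescribeClusterInput{Name: aws.String(clusterName)})
	if err != nil {
		return nil, err
	}

	gen, err := token.NewGenerator(false, false)
	if err != nil {
		return nil, err
	}
	tok, err := gen.GetWithOptions(&token.GetTokenOptions{ClusterID: clusterName})
	if err != nil {
		return nil, err
	}

	// the cluster's CA certificate comes back base64-encoded
	ca, err := base64.StdEncoding.DecodeString(*out.Cluster.CertificateAuthority.Data)
	if err != nil {
		return nil, err
	}

	return &rest.Config{
		Host:            *out.Cluster.Endpoint,
		BearerToken:     tok.Token,
		TLSClientConfig: rest.TLSClientConfig{CAData: ca},
	}, nil
}
```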
5 changes: 3 additions & 2 deletions cli/cmd/lib_cluster_config.go
@@ -27,6 +27,7 @@ import (
cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
"github.com/cortexlabs/cortex/pkg/lib/errors"
"github.com/cortexlabs/cortex/pkg/lib/files"
"github.com/cortexlabs/cortex/pkg/lib/k8s"
"github.com/cortexlabs/cortex/pkg/lib/maps"
libmath "github.com/cortexlabs/cortex/pkg/lib/math"
"github.com/cortexlabs/cortex/pkg/lib/pointer"
@@ -140,7 +141,7 @@ func getInstallClusterConfig(awsClient *aws.Client, clusterConfigFile string) (*
return clusterConfig, nil
}

func getConfigureClusterConfig(awsClient *aws.Client, k8sClient *k8s.Client, stacks clusterstate.ClusterStacks, cachedClusterConfig clusterconfig.Config, newClusterConfigFile string) (*clusterconfig.Config, clusterconfig.ConfigureChanges, error) {
newUserClusterConfig := &clusterconfig.Config{}

err := readUserClusterConfigFile(newUserClusterConfig, newClusterConfigFile)
@@ -151,7 +152,7 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
newUserClusterConfig.Telemetry = isTelemetryEnabled()
cachedClusterConfig.Telemetry = newUserClusterConfig.Telemetry

configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, k8sClient, cachedClusterConfig, stacks.NodeGroupsStacks)
if err != nil {
err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor))
return nil, clusterconfig.ConfigureChanges{}, errors.Wrap(err, newClusterConfigFile)
1 change: 1 addition & 0 deletions cli/cmd/lib_manager.go
@@ -191,6 +191,7 @@ func runManagerWithClusterConfig(entrypoint string, clusterConfig *clusterconfig
"CORTEX_TELEMETRY_SENTRY_DSN=" + os.Getenv("CORTEX_TELEMETRY_SENTRY_DSN"),
"CORTEX_TELEMETRY_SEGMENT_WRITE_KEY=" + os.Getenv("CORTEX_TELEMETRY_SEGMENT_WRITE_KEY"),
"CORTEX_DEV_DEFAULT_IMAGE_REGISTRY=" + os.Getenv("CORTEX_DEV_DEFAULT_IMAGE_REGISTRY"),
"CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD=" + os.Getenv("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD"),
"CORTEX_CLUSTER_CONFIG_FILE=" + containerClusterConfigPath,
}
envs = append(envs, extraEnvs...)
7 changes: 5 additions & 2 deletions dev/versions.md
@@ -177,11 +177,14 @@ see https://github.com/moby/moby/issues/39302#issuecomment-639687466_

1. Find the latest patch release for our current version of k8s (e.g. k8s v1.17 -> cluster-autoscaler v1.17.3)
on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog
1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws),
set the tree to the tag for the chosen release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml`
(e.g. <https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.20.0/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml>)
1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`.
1. Pull the release branch from the upstream repo to Cortex's fork on [GitHub](https://github.com/cortexlabs/autoscaler).
1. Apply the rate-limiter changes from the previous version to the new one (currently sitting on the `cluster-autoscaler-release-1.20` branch); see the sketch after this list.
1. Update the `-b` flag's value in the `git clone` command in `images/cluster-autoscaler/Dockerfile` to the branch name of the latest release on Cortex's fork.
1. Match the Go version of the builder in `images/cluster-autoscaler/Dockerfile` with that of the [cluster autoscaler](https://github.com/kubernetes/autoscaler)'s Dockerfile.
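
A minimal sketch of the fork-update steps above; the `1.21` branch name and the cherry-pick range are placeholders for whatever the next release turns out to be:

```bash
# sketch of updating Cortex's autoscaler fork to a new release branch
git clone https://github.com/kubernetes/autoscaler
cd autoscaler
git remote add cortex [email protected]:cortexlabs/autoscaler.git

# push the new upstream release branch to Cortex's fork
git checkout cluster-autoscaler-release-1.21
git push cortex cluster-autoscaler-release-1.21

# re-apply the rate-limiter changes carried on the previous release branch
git cherry-pick <first-rate-limiter-commit>^..<last-rate-limiter-commit>
git push cortex cluster-autoscaler-release-1.21
```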

## FluentBit

12 changes: 11 additions & 1 deletion images/cluster-autoscaler/Dockerfile
@@ -1 +1,11 @@
FROM golang:1.15 AS builder

# TARGETOS/TARGETARCH are set automatically by buildx; declare them inside the stage so the RUN below can use them
ARG TARGETOS
ARG TARGETARCH

# build the autoscaler from Cortex's fork, which carries the scale-up rate-limiter patches
RUN git clone -b cluster-autoscaler-release-1.20 --depth 1 https://github.com/cortexlabs/autoscaler /k8s.io/autoscaler
WORKDIR /k8s.io/autoscaler/cluster-autoscaler
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -installsuffix cgo -o cluster-autoscaler k8s.io/autoscaler/cluster-autoscaler \
 && cp cluster-autoscaler /usr/local/bin

FROM alpine:3.8
RUN apk add -U --no-cache ca-certificates && rm -rf /var/cache/apk/*
COPY --from=builder /usr/local/bin/cluster-autoscaler .
9 changes: 7 additions & 2 deletions manager/install.sh
@@ -225,7 +225,9 @@ function setup_prometheus() {
envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/prometheus-kube-state-metrics.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml
kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml --dry-run=client -o yaml | kubectl apply -f - >/dev/null
}
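
For context, prometheus-operator picks up this secret via the Prometheus resource's `additionalScrapeConfigs` field; a sketch of the wiring (the actual reference lives in Cortex's monitoring template, not shown in this diff):

```yaml
# sketch: the operator resolves this secret key into extra scrape jobs,
# e.g. one scraping the EKS control plane's API server metrics
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
spec:
  additionalScrapeConfigs:
    name: additional-scrape-configs
    key: prometheus-additional-scrape-configs.yaml
```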

function setup_grafana() {
@@ -235,7 +237,10 @@ function setup_grafana() {
kubectl apply -f manifests/grafana/grafana-dashboard-task.yaml >/dev/null
kubectl apply -f manifests/grafana/grafana-dashboard-cluster.yaml >/dev/null
kubectl apply -f manifests/grafana/grafana-dashboard-nodes.yaml >/dev/null
if [ "$CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD" = "true" ]; then
kubectl apply -f manifests/grafana/grafana-dashboard-control-plane.yaml >/dev/null
fi
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/grafana/grafana.yaml.j2 | kubectl apply -f - >/dev/null
}
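
End to end, a dev-environment flow that exercises this switch might look like the following sketch (the cluster-up invocation is the standard CLI flow, not something this PR adds, and its exact form may vary by Cortex version):

```bash
# dev-only: include the EKS control plane dashboard in Grafana at install time
export CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD="true"
cortex cluster up cluster.yaml
```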

function restart_operator() {
3 changes: 3 additions & 0 deletions manager/manifests/cluster-autoscaler.yaml.j2
@@ -197,6 +197,9 @@ spec:
- --ok-total-unready-count=30
- --max-node-provision-time=8m
- --scan-interval=20s
- --scale-up-rate-limit-enabled=true
- --scale-up-max-number-nodes-per-min=50
- --scale-up-burst-number-nodes-per-min=75
- --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ config['cluster_name'] }}
volumeMounts:
- name: ssl-certs
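These `--scale-up-*` flags come from the rate-limiter patches on Cortex's autoscaler fork: scale-ups are capped at a sustained 50 nodes per minute, with bursts of up to 75. Conceptually that is token-bucket rate limiting; the sketch below only illustrates the idea and is not the fork's implementation:

```go
package main

import (
	"fmt"
	"time"
)

// tokenBucket models the scale-up limit: tokens refill at ratePerMin and
// accumulate up to burst; each node added costs one token.
type tokenBucket struct {
	ratePerMin float64
	burst      float64
	tokens     float64
	last       time.Time
}

// allow reports whether n nodes may be added at time now, spending tokens if so.
func (b *tokenBucket) allow(n int, now time.Time) bool {
	b.tokens += b.ratePerMin * now.Sub(b.last).Minutes()
	if b.tokens > b.burst {
		b.tokens = b.burst
	}
	b.last = now
	if b.tokens < float64(n) {
		return false // defer the scale-up; not enough budget yet
	}
	b.tokens -= float64(n)
	return true
}

func main() {
	now := time.Now()
	b := &tokenBucket{ratePerMin: 50, burst: 75, tokens: 75, last: now}
	fmt.Println(b.allow(75, now))                  // true: full burst is available
	fmt.Println(b.allow(10, now))                  // false: bucket is drained
	fmt.Println(b.allow(10, now.Add(time.Minute))) // true: ~50 tokens refilled
}
```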