Skip to content

Commit 83ad1d4

Browse files
author
Miguel Varela Ramos
authored
Cluster health command (#2313)
1 parent 0375fe1 commit 83ad1d4

File tree

7 files changed

+551
-30
lines changed

7 files changed

+551
-30
lines changed

cli/cmd/cluster.go

Lines changed: 160 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,28 +17,33 @@ limitations under the License.
1717
package cmd
1818

1919
import (
20+
"encoding/base64"
2021
"fmt"
2122
"os"
2223
"path/filepath"
2324
"regexp"
2425
"strings"
2526
"time"
2627

28+
"github.com/aws/aws-sdk-go/aws"
2729
"github.com/aws/aws-sdk-go/service/autoscaling"
2830
"github.com/aws/aws-sdk-go/service/ec2"
31+
"github.com/aws/aws-sdk-go/service/eks"
2932
"github.com/aws/aws-sdk-go/service/elbv2"
3033
"github.com/aws/aws-sdk-go/service/s3"
3134
"github.com/cortexlabs/cortex/cli/cluster"
3235
"github.com/cortexlabs/cortex/cli/types/cliconfig"
3336
"github.com/cortexlabs/cortex/cli/types/flags"
3437
"github.com/cortexlabs/cortex/pkg/consts"
35-
"github.com/cortexlabs/cortex/pkg/lib/aws"
38+
"github.com/cortexlabs/cortex/pkg/health"
39+
awslib "github.com/cortexlabs/cortex/pkg/lib/aws"
3640
"github.com/cortexlabs/cortex/pkg/lib/console"
3741
"github.com/cortexlabs/cortex/pkg/lib/docker"
3842
"github.com/cortexlabs/cortex/pkg/lib/errors"
3943
"github.com/cortexlabs/cortex/pkg/lib/exit"
4044
"github.com/cortexlabs/cortex/pkg/lib/files"
4145
libjson "github.com/cortexlabs/cortex/pkg/lib/json"
46+
"github.com/cortexlabs/cortex/pkg/lib/k8s"
4247
libmath "github.com/cortexlabs/cortex/pkg/lib/math"
4348
"github.com/cortexlabs/cortex/pkg/lib/pointer"
4449
"github.com/cortexlabs/cortex/pkg/lib/prompt"
@@ -51,6 +56,10 @@ import (
5156
"github.com/cortexlabs/cortex/pkg/types/clusterstate"
5257
"github.com/cortexlabs/yaml"
5358
"github.com/spf13/cobra"
59+
"k8s.io/apimachinery/pkg/runtime"
60+
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
61+
"k8s.io/client-go/rest"
62+
"sigs.k8s.io/aws-iam-authenticator/pkg/token"
5463
)
5564

5665
var (
@@ -101,11 +110,21 @@ func clusterInit() {
101110
addClusterNameFlag(_clusterExportCmd)
102111
addClusterRegionFlag(_clusterExportCmd)
103112
_clusterCmd.AddCommand(_clusterExportCmd)
113+
114+
_clusterHealthCmd.Flags().SortFlags = false
115+
addClusterConfigFlag(_clusterHealthCmd)
116+
addClusterNameFlag(_clusterHealthCmd)
117+
addClusterRegionFlag(_clusterHealthCmd)
118+
_clusterHealthCmd.Flags().VarP(&_flagOutput, "output", "o", fmt.Sprintf("output format: one of %s", strings.Join(flags.OutputTypeStringsExcluding(flags.YAMLOutputType), "|")))
119+
_clusterCmd.AddCommand(_clusterHealthCmd)
104120
}
105121

106122
func addClusterConfigFlag(cmd *cobra.Command) {
107123
cmd.Flags().StringVarP(&_flagClusterConfig, "config", "c", "", "path to a cluster configuration file")
108-
cmd.Flags().SetAnnotation("config", cobra.BashCompFilenameExt, _configFileExts)
124+
err := cmd.Flags().SetAnnotation("config", cobra.BashCompFilenameExt, _configFileExts)
125+
if err != nil {
126+
exit.Error(err) // should never happen
127+
}
109128
}
110129

111130
func addClusterNameFlag(cmd *cobra.Command) {
@@ -631,8 +650,8 @@ var _clusterDownCmd = &cobra.Command{
631650
}
632651

633652
// best-effort deletion of cached config
634-
cachedClusterConfigPath := cachedClusterConfigPath(accessConfig.ClusterName, accessConfig.Region)
635-
os.Remove(cachedClusterConfigPath)
653+
cachedClusterConfigPath := getCachedClusterConfigPath(accessConfig.ClusterName, accessConfig.Region)
654+
_ = os.Remove(cachedClusterConfigPath)
636655

637656
if len(errorsList) > 0 {
638657
exit.Error(errors.ListOfErrors(ErrClusterDown, false, errorsList...))
@@ -743,7 +762,83 @@ var _clusterExportCmd = &cobra.Command{
743762
},
744763
}
745764

746-
func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, stacks clusterstate.ClusterStacks, printConfig bool, outputType flags.OutputType, disallowPrompt bool) {
765+
var _clusterHealthCmd = &cobra.Command{
766+
Use: "health",
767+
Short: "inspect the health of components in the cluster",
768+
Args: cobra.NoArgs,
769+
Run: func(cmd *cobra.Command, args []string) {
770+
accessConfig, err := getClusterAccessConfigWithCache(true)
771+
if err != nil {
772+
exit.Error(err)
773+
}
774+
775+
awsClient, err := awslib.NewForRegion(accessConfig.Region)
776+
if err != nil {
777+
exit.Error(err)
778+
}
779+
780+
restConfig, err := getClusterRESTConfig(awsClient, accessConfig.ClusterName)
781+
if err != nil {
782+
exit.Error(err)
783+
}
784+
785+
scheme := runtime.NewScheme()
786+
if err := clientgoscheme.AddToScheme(scheme); err != nil {
787+
exit.Error(err)
788+
}
789+
790+
k8sClient, err := k8s.New("default", false, restConfig, scheme)
791+
if err != nil {
792+
exit.Error(err)
793+
}
794+
795+
clusterHealth, err := health.Check(awsClient, k8sClient, accessConfig.ClusterName)
796+
if err != nil {
797+
exit.Error(err)
798+
}
799+
800+
clusterWarnings, err := health.GetWarnings(k8sClient)
801+
if err != nil {
802+
exit.Error(err)
803+
}
804+
805+
if _flagOutput == flags.JSONOutputType {
806+
fmt.Println(clusterHealth)
807+
return
808+
}
809+
810+
healthTable := table.Table{
811+
Headers: []table.Header{
812+
{Title: ""},
813+
{Title: "live"},
814+
{Title: "warning", Hidden: !clusterWarnings.HasWarnings()},
815+
},
816+
Rows: [][]interface{}{
817+
{"operator", console.BoolColor(clusterHealth.Operator), ""},
818+
{"prometheus", console.BoolColor(clusterHealth.Prometheus), clusterWarnings.Prometheus},
819+
{"autoscaler", console.BoolColor(clusterHealth.Autoscaler), ""},
820+
{"activator", console.BoolColor(clusterHealth.Activator), ""},
821+
{"grafana", console.BoolColor(clusterHealth.Grafana), ""},
822+
{"controller manager", console.BoolColor(clusterHealth.ControllerManager), ""},
823+
{"apis gateway", console.BoolColor(clusterHealth.APIsGateway), ""},
824+
{"operator gateway", console.BoolColor(clusterHealth.APIsGateway), ""},
825+
{"cluster autoscaler", console.BoolColor(clusterHealth.ClusterAutoscaler), ""},
826+
{"operator load balancer", console.BoolColor(clusterHealth.OperatorLoadBalancer), ""},
827+
{"apis load balancer", console.BoolColor(clusterHealth.APIsLoadBalancer), ""},
828+
{"fluent bit", console.BoolColor(clusterHealth.FluentBit), ""},
829+
{"node exporter", console.BoolColor(clusterHealth.NodeExporter), ""},
830+
{"dcgm exporter", console.BoolColor(clusterHealth.DCGMExporter), ""},
831+
{"statsd exporter", console.BoolColor(clusterHealth.StatsDExporter), ""},
832+
{"event exporter", console.BoolColor(clusterHealth.EventExporter), ""},
833+
{"kube state metrics", console.BoolColor(clusterHealth.KubeStateMetrics), ""},
834+
},
835+
}
836+
837+
fmt.Println(healthTable.MustFormat())
838+
},
839+
}
840+
841+
func cmdInfo(awsClient *awslib.Client, accessConfig *clusterconfig.AccessConfig, stacks clusterstate.ClusterStacks, printConfig bool, outputType flags.OutputType, disallowPrompt bool) {
747842
clusterConfig := refreshCachedClusterConfig(awsClient, accessConfig, outputType == flags.PrettyOutputType)
748843

749844
operatorLoadBalancer, err := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
@@ -846,14 +941,14 @@ func getInfoOperatorResponse(operatorEndpoint string) (*schema.InfoResponse, err
846941
}
847942

848943
func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterconfig.Config) {
849-
eksPrice := aws.EKSPrices[clusterConfig.Region]
850-
operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
851-
operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
852-
prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
853-
prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
854-
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
855-
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
856-
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
944+
eksPrice := awslib.EKSPrices[clusterConfig.Region]
945+
operatorInstancePrice := awslib.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
946+
operatorEBSPrice := awslib.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
947+
prometheusInstancePrice := awslib.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
948+
prometheusEBSPrice := awslib.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
949+
metricsEBSPrice := awslib.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
950+
nlbPrice := awslib.NLBMetadatas[clusterConfig.Region].Price
951+
natUnitPrice := awslib.NATMetadatas[clusterConfig.Region].Price
857952

858953
headers := []table.Header{
859954
{Title: "aws resource"},
@@ -874,13 +969,13 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
874969
nodesInfo := infoResponse.GetNodesWithNodeGroupName(ngNamePrefix + ng.Name)
875970
numInstances := len(nodesInfo)
876971

877-
ebsPrice := aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceGB * float64(ng.InstanceVolumeSize) / 30 / 24
972+
ebsPrice := awslib.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceGB * float64(ng.InstanceVolumeSize) / 30 / 24
878973
if ng.InstanceVolumeType == clusterconfig.IO1VolumeType && ng.InstanceVolumeIOPS != nil {
879-
ebsPrice += aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceIOPS * float64(*ng.InstanceVolumeIOPS) / 30 / 24
974+
ebsPrice += awslib.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceIOPS * float64(*ng.InstanceVolumeIOPS) / 30 / 24
880975
}
881976
if ng.InstanceVolumeType == clusterconfig.GP3VolumeType && ng.InstanceVolumeIOPS != nil && ng.InstanceVolumeThroughput != nil {
882-
ebsPrice += libmath.MaxFloat64(0, (aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceIOPS-3000)*float64(*ng.InstanceVolumeIOPS)/30/24)
883-
ebsPrice += libmath.MaxFloat64(0, (aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceThroughput-125)*float64(*ng.InstanceVolumeThroughput)/30/24)
977+
ebsPrice += libmath.MaxFloat64(0, (awslib.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceIOPS-3000)*float64(*ng.InstanceVolumeIOPS)/30/24)
978+
ebsPrice += libmath.MaxFloat64(0, (awslib.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceThroughput-125)*float64(*ng.InstanceVolumeThroughput)/30/24)
884979
}
885980
totalEBSPrice := ebsPrice * float64(numInstances)
886981

@@ -1040,7 +1135,7 @@ func updateCLIEnv(envName string, operatorEndpoint string, disallowPrompt bool,
10401135
return nil
10411136
}
10421137

1043-
func cmdDebug(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig) {
1138+
func cmdDebug(awsClient *awslib.Client, accessConfig *clusterconfig.AccessConfig) {
10441139
// note: if modifying this string, also change it in files.IgnoreCortexDebug()
10451140
debugFileName := fmt.Sprintf("cortex-debug-%s.tgz", time.Now().UTC().Format("2006-01-02-15-04-05"))
10461141

@@ -1064,9 +1159,9 @@ func cmdDebug(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig) {
10641159
return
10651160
}
10661161

1067-
func refreshCachedClusterConfig(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, printToStdout bool) clusterconfig.Config {
1162+
func refreshCachedClusterConfig(awsClient *awslib.Client, accessConfig *clusterconfig.AccessConfig, printToStdout bool) clusterconfig.Config {
10681163
// add empty file if cached cluster doesn't exist so that the file output by manager container maintains current user permissions
1069-
cachedClusterConfigPath := cachedClusterConfigPath(accessConfig.ClusterName, accessConfig.Region)
1164+
cachedClusterConfigPath := getCachedClusterConfigPath(accessConfig.ClusterName, accessConfig.Region)
10701165
containerConfigPath := fmt.Sprintf("/out/%s", filepath.Base(cachedClusterConfigPath))
10711166

10721167
copyFromPaths := []dockerCopyFromPath{
@@ -1095,7 +1190,7 @@ func refreshCachedClusterConfig(awsClient *aws.Client, accessConfig *clusterconf
10951190
return *refreshedClusterConfig
10961191
}
10971192

1098-
func createS3BucketIfNotFound(awsClient *aws.Client, bucket string, tags map[string]string) error {
1193+
func createS3BucketIfNotFound(awsClient *awslib.Client, bucket string, tags map[string]string) error {
10991194
bucketFound, err := awsClient.DoesBucketExist(bucket)
11001195
if err != nil {
11011196
return err
@@ -1123,7 +1218,7 @@ func createS3BucketIfNotFound(awsClient *aws.Client, bucket string, tags map[str
11231218
fmt.Println(" ✓")
11241219
return nil
11251220
}
1126-
if !aws.IsNoSuchBucketErr(err) {
1221+
if !awslib.IsNoSuchBucketErr(err) {
11271222
break
11281223
}
11291224
time.Sleep(1 * time.Second)
@@ -1133,7 +1228,7 @@ func createS3BucketIfNotFound(awsClient *aws.Client, bucket string, tags map[str
11331228
return err
11341229
}
11351230

1136-
func setLifecycleRulesOnClusterUp(awsClient *aws.Client, bucket, newClusterUID string) error {
1231+
func setLifecycleRulesOnClusterUp(awsClient *awslib.Client, bucket, newClusterUID string) error {
11371232
err := awsClient.DeleteLifecycleRules(bucket)
11381233
if err != nil {
11391234
return err
@@ -1177,7 +1272,7 @@ func setLifecycleRulesOnClusterUp(awsClient *aws.Client, bucket, newClusterUID s
11771272
return awsClient.SetLifecycleRules(bucket, rules)
11781273
}
11791274

1180-
func setLifecycleRulesOnClusterDown(awsClient *aws.Client, bucket string) error {
1275+
func setLifecycleRulesOnClusterDown(awsClient *awslib.Client, bucket string) error {
11811276
err := awsClient.DeleteLifecycleRules(bucket)
11821277
if err != nil {
11831278
return err
@@ -1198,7 +1293,7 @@ func setLifecycleRulesOnClusterDown(awsClient *aws.Client, bucket string) error
11981293
})
11991294
}
12001295

1201-
func createLogGroupIfNotFound(awsClient *aws.Client, logGroup string, tags map[string]string) error {
1296+
func createLogGroupIfNotFound(awsClient *awslib.Client, logGroup string, tags map[string]string) error {
12021297
logGroupFound, err := awsClient.DoesLogGroupExist(logGroup)
12031298
if err != nil {
12041299
return err
@@ -1240,7 +1335,7 @@ func (lb LoadBalancer) String() string {
12401335
}
12411336

12421337
// Will return error if the load balancer can't be found
1243-
func getLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *aws.Client) (*elbv2.LoadBalancer, error) {
1338+
func getLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *awslib.Client) (*elbv2.LoadBalancer, error) {
12441339
loadBalancer, err := awsClient.FindLoadBalancer(map[string]string{
12451340
clusterconfig.ClusterNameTag: clusterName,
12461341
"cortex.dev/load-balancer": whichLB.String(),
@@ -1256,7 +1351,7 @@ func getLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *aws.Cl
12561351
return loadBalancer, nil
12571352
}
12581353

1259-
func listPVCVolumesForCluster(awsClient *aws.Client, clusterName string) ([]ec2.Volume, error) {
1354+
func listPVCVolumesForCluster(awsClient *awslib.Client, clusterName string) ([]ec2.Volume, error) {
12601355
return awsClient.ListVolumes(ec2.Tag{
12611356
Key: pointer.String(fmt.Sprintf("kubernetes.io/cluster/%s", clusterName)),
12621357
Value: nil, // any value should be ok as long as the key is present
@@ -1266,3 +1361,41 @@ func listPVCVolumesForCluster(awsClient *aws.Client, clusterName string) ([]ec2.
12661361
func filterEKSCTLOutput(out string) string {
12671362
return strings.Join(s.RemoveDuplicates(strings.Split(out, "\n"), _eksctlPrefixRegex), "\n")
12681363
}
1364+
1365+
func getClusterRESTConfig(awsClient *awslib.Client, clusterName string) (*rest.Config, error) {
1366+
clusterOutput, err := awsClient.EKS().DescribeCluster(
1367+
&eks.DescribeClusterInput{
1368+
Name: aws.String(clusterName),
1369+
},
1370+
)
1371+
if err != nil {
1372+
return nil, err
1373+
}
1374+
1375+
gen, err := token.NewGenerator(true, false)
1376+
if err != nil {
1377+
return nil, err
1378+
}
1379+
1380+
opts := &token.GetTokenOptions{
1381+
ClusterID: aws.StringValue(clusterOutput.Cluster.Name),
1382+
}
1383+
1384+
tok, err := gen.GetWithOptions(opts)
1385+
if err != nil {
1386+
return nil, err
1387+
}
1388+
1389+
ca, err := base64.StdEncoding.DecodeString(aws.StringValue(clusterOutput.Cluster.CertificateAuthority.Data))
1390+
if err != nil {
1391+
return nil, err
1392+
}
1393+
1394+
return &rest.Config{
1395+
Host: aws.StringValue(clusterOutput.Cluster.Endpoint),
1396+
BearerToken: tok.Token,
1397+
TLSClientConfig: rest.TLSClientConfig{
1398+
CAData: ca,
1399+
},
1400+
}, nil
1401+
}

cli/cmd/lib_cluster_config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ import (
3939

4040
var _cachedClusterConfigRegex = regexp.MustCompile(`^cluster_\S+\.yaml$`)
4141

42-
func cachedClusterConfigPath(clusterName string, region string) string {
42+
func getCachedClusterConfigPath(clusterName string, region string) string {
4343
return filepath.Join(_localDir, fmt.Sprintf("cluster_%s_%s.yaml", clusterName, region))
4444
}
4545

cli/cmd/lib_manager.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func runManager(containerConfig *container.Config, addNewLineAfterPull bool, cop
8181
}
8282

8383
removeContainer := func() {
84-
dockerClient.ContainerRemove(context.Background(), containerInfo.ID, dockertypes.ContainerRemoveOptions{
84+
_ = dockerClient.ContainerRemove(context.Background(), containerInfo.ID, dockertypes.ContainerRemoveOptions{
8585
RemoveVolumes: true,
8686
Force: true,
8787
})
@@ -166,7 +166,7 @@ func runManagerWithClusterConfig(entrypoint string, clusterConfig *clusterconfig
166166
return "", nil, errors.WithStack(err)
167167
}
168168

169-
cachedClusterConfigPath := cachedClusterConfigPath(clusterConfig.ClusterName, clusterConfig.Region)
169+
cachedClusterConfigPath := getCachedClusterConfigPath(clusterConfig.ClusterName, clusterConfig.Region)
170170
if err := files.WriteFile(clusterConfigBytes, cachedClusterConfigPath); err != nil {
171171
return "", nil, err
172172
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,9 @@ require (
7979
k8s.io/client-go v0.20.8
8080
k8s.io/klog/v2 v2.9.0 // indirect
8181
k8s.io/kube-openapi v0.0.0-20210527164424-3c818078ee3d // indirect
82+
k8s.io/metrics v0.20.8
8283
k8s.io/utils v0.0.0-20210629042839-4a2b36d8d73f // indirect
84+
sigs.k8s.io/aws-iam-authenticator v0.5.3
8385
sigs.k8s.io/controller-runtime v0.8.3
8486
sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect
8587
)

0 commit comments

Comments
 (0)