@@ -17,28 +17,33 @@ limitations under the License.
17
17
package cmd
18
18
19
19
import (
20
+ "encoding/base64"
20
21
"fmt"
21
22
"os"
22
23
"path/filepath"
23
24
"regexp"
24
25
"strings"
25
26
"time"
26
27
28
+ "github.com/aws/aws-sdk-go/aws"
27
29
"github.com/aws/aws-sdk-go/service/autoscaling"
28
30
"github.com/aws/aws-sdk-go/service/ec2"
31
+ "github.com/aws/aws-sdk-go/service/eks"
29
32
"github.com/aws/aws-sdk-go/service/elbv2"
30
33
"github.com/aws/aws-sdk-go/service/s3"
31
34
"github.com/cortexlabs/cortex/cli/cluster"
32
35
"github.com/cortexlabs/cortex/cli/types/cliconfig"
33
36
"github.com/cortexlabs/cortex/cli/types/flags"
34
37
"github.com/cortexlabs/cortex/pkg/consts"
35
- "github.com/cortexlabs/cortex/pkg/lib/aws"
38
+ "github.com/cortexlabs/cortex/pkg/health"
39
+ awslib "github.com/cortexlabs/cortex/pkg/lib/aws"
36
40
"github.com/cortexlabs/cortex/pkg/lib/console"
37
41
"github.com/cortexlabs/cortex/pkg/lib/docker"
38
42
"github.com/cortexlabs/cortex/pkg/lib/errors"
39
43
"github.com/cortexlabs/cortex/pkg/lib/exit"
40
44
"github.com/cortexlabs/cortex/pkg/lib/files"
41
45
libjson "github.com/cortexlabs/cortex/pkg/lib/json"
46
+ "github.com/cortexlabs/cortex/pkg/lib/k8s"
42
47
libmath "github.com/cortexlabs/cortex/pkg/lib/math"
43
48
"github.com/cortexlabs/cortex/pkg/lib/pointer"
44
49
"github.com/cortexlabs/cortex/pkg/lib/prompt"
@@ -51,6 +56,10 @@ import (
51
56
"github.com/cortexlabs/cortex/pkg/types/clusterstate"
52
57
"github.com/cortexlabs/yaml"
53
58
"github.com/spf13/cobra"
59
+ "k8s.io/apimachinery/pkg/runtime"
60
+ clientgoscheme "k8s.io/client-go/kubernetes/scheme"
61
+ "k8s.io/client-go/rest"
62
+ "sigs.k8s.io/aws-iam-authenticator/pkg/token"
54
63
)
55
64
56
65
var (
@@ -101,11 +110,21 @@ func clusterInit() {
101
110
addClusterNameFlag (_clusterExportCmd )
102
111
addClusterRegionFlag (_clusterExportCmd )
103
112
_clusterCmd .AddCommand (_clusterExportCmd )
113
+
114
+ _clusterHealthCmd .Flags ().SortFlags = false
115
+ addClusterConfigFlag (_clusterHealthCmd )
116
+ addClusterNameFlag (_clusterHealthCmd )
117
+ addClusterRegionFlag (_clusterHealthCmd )
118
+ _clusterHealthCmd .Flags ().VarP (& _flagOutput , "output" , "o" , fmt .Sprintf ("output format: one of %s" , strings .Join (flags .OutputTypeStringsExcluding (flags .YAMLOutputType ), "|" )))
119
+ _clusterCmd .AddCommand (_clusterHealthCmd )
104
120
}
105
121
106
122
func addClusterConfigFlag (cmd * cobra.Command ) {
107
123
cmd .Flags ().StringVarP (& _flagClusterConfig , "config" , "c" , "" , "path to a cluster configuration file" )
108
- cmd .Flags ().SetAnnotation ("config" , cobra .BashCompFilenameExt , _configFileExts )
124
+ err := cmd .Flags ().SetAnnotation ("config" , cobra .BashCompFilenameExt , _configFileExts )
125
+ if err != nil {
126
+ exit .Error (err ) // should never happen
127
+ }
109
128
}
110
129
111
130
func addClusterNameFlag (cmd * cobra.Command ) {
@@ -631,8 +650,8 @@ var _clusterDownCmd = &cobra.Command{
631
650
}
632
651
633
652
// best-effort deletion of cached config
634
- cachedClusterConfigPath := cachedClusterConfigPath (accessConfig .ClusterName , accessConfig .Region )
635
- os .Remove (cachedClusterConfigPath )
653
+ cachedClusterConfigPath := getCachedClusterConfigPath (accessConfig .ClusterName , accessConfig .Region )
654
+ _ = os .Remove (cachedClusterConfigPath )
636
655
637
656
if len (errorsList ) > 0 {
638
657
exit .Error (errors .ListOfErrors (ErrClusterDown , false , errorsList ... ))
@@ -743,7 +762,83 @@ var _clusterExportCmd = &cobra.Command{
743
762
},
744
763
}
745
764
746
- func cmdInfo (awsClient * aws.Client , accessConfig * clusterconfig.AccessConfig , stacks clusterstate.ClusterStacks , printConfig bool , outputType flags.OutputType , disallowPrompt bool ) {
765
+ var _clusterHealthCmd = & cobra.Command {
766
+ Use : "health" ,
767
+ Short : "inspect the health of components in the cluster" ,
768
+ Args : cobra .NoArgs ,
769
+ Run : func (cmd * cobra.Command , args []string ) {
770
+ accessConfig , err := getClusterAccessConfigWithCache (true )
771
+ if err != nil {
772
+ exit .Error (err )
773
+ }
774
+
775
+ awsClient , err := awslib .NewForRegion (accessConfig .Region )
776
+ if err != nil {
777
+ exit .Error (err )
778
+ }
779
+
780
+ restConfig , err := getClusterRESTConfig (awsClient , accessConfig .ClusterName )
781
+ if err != nil {
782
+ exit .Error (err )
783
+ }
784
+
785
+ scheme := runtime .NewScheme ()
786
+ if err := clientgoscheme .AddToScheme (scheme ); err != nil {
787
+ exit .Error (err )
788
+ }
789
+
790
+ k8sClient , err := k8s .New ("default" , false , restConfig , scheme )
791
+ if err != nil {
792
+ exit .Error (err )
793
+ }
794
+
795
+ clusterHealth , err := health .Check (awsClient , k8sClient , accessConfig .ClusterName )
796
+ if err != nil {
797
+ exit .Error (err )
798
+ }
799
+
800
+ clusterWarnings , err := health .GetWarnings (k8sClient )
801
+ if err != nil {
802
+ exit .Error (err )
803
+ }
804
+
805
+ if _flagOutput == flags .JSONOutputType {
806
+ fmt .Println (clusterHealth )
807
+ return
808
+ }
809
+
810
+ healthTable := table.Table {
811
+ Headers : []table.Header {
812
+ {Title : "" },
813
+ {Title : "live" },
814
+ {Title : "warning" , Hidden : ! clusterWarnings .HasWarnings ()},
815
+ },
816
+ Rows : [][]interface {}{
817
+ {"operator" , console .BoolColor (clusterHealth .Operator ), "" },
818
+ {"prometheus" , console .BoolColor (clusterHealth .Prometheus ), clusterWarnings .Prometheus },
819
+ {"autoscaler" , console .BoolColor (clusterHealth .Autoscaler ), "" },
820
+ {"activator" , console .BoolColor (clusterHealth .Activator ), "" },
821
+ {"grafana" , console .BoolColor (clusterHealth .Grafana ), "" },
822
+ {"controller manager" , console .BoolColor (clusterHealth .ControllerManager ), "" },
823
+ {"apis gateway" , console .BoolColor (clusterHealth .APIsGateway ), "" },
824
+ {"operator gateway" , console .BoolColor (clusterHealth .APIsGateway ), "" },
825
+ {"cluster autoscaler" , console .BoolColor (clusterHealth .ClusterAutoscaler ), "" },
826
+ {"operator load balancer" , console .BoolColor (clusterHealth .OperatorLoadBalancer ), "" },
827
+ {"apis load balancer" , console .BoolColor (clusterHealth .APIsLoadBalancer ), "" },
828
+ {"fluent bit" , console .BoolColor (clusterHealth .FluentBit ), "" },
829
+ {"node exporter" , console .BoolColor (clusterHealth .NodeExporter ), "" },
830
+ {"dcgm exporter" , console .BoolColor (clusterHealth .DCGMExporter ), "" },
831
+ {"statsd exporter" , console .BoolColor (clusterHealth .StatsDExporter ), "" },
832
+ {"event exporter" , console .BoolColor (clusterHealth .EventExporter ), "" },
833
+ {"kube state metrics" , console .BoolColor (clusterHealth .KubeStateMetrics ), "" },
834
+ },
835
+ }
836
+
837
+ fmt .Println (healthTable .MustFormat ())
838
+ },
839
+ }
840
+
841
+ func cmdInfo (awsClient * awslib.Client , accessConfig * clusterconfig.AccessConfig , stacks clusterstate.ClusterStacks , printConfig bool , outputType flags.OutputType , disallowPrompt bool ) {
747
842
clusterConfig := refreshCachedClusterConfig (awsClient , accessConfig , outputType == flags .PrettyOutputType )
748
843
749
844
operatorLoadBalancer , err := getLoadBalancer (accessConfig .ClusterName , OperatorLoadBalancer , awsClient )
@@ -846,14 +941,14 @@ func getInfoOperatorResponse(operatorEndpoint string) (*schema.InfoResponse, err
846
941
}
847
942
848
943
func printInfoPricing (infoResponse * schema.InfoResponse , clusterConfig clusterconfig.Config ) {
849
- eksPrice := aws .EKSPrices [clusterConfig .Region ]
850
- operatorInstancePrice := aws .InstanceMetadatas [clusterConfig .Region ]["t3.medium" ].Price
851
- operatorEBSPrice := aws .EBSMetadatas [clusterConfig .Region ]["gp3" ].PriceGB * 20 / 30 / 24
852
- prometheusInstancePrice := aws .InstanceMetadatas [clusterConfig.Region ][clusterConfig.PrometheusInstanceType ].Price
853
- prometheusEBSPrice := aws .EBSMetadatas [clusterConfig .Region ]["gp3" ].PriceGB * 20 / 30 / 24
854
- metricsEBSPrice := aws .EBSMetadatas [clusterConfig .Region ]["gp2" ].PriceGB * (40 + 2 ) / 30 / 24
855
- nlbPrice := aws .NLBMetadatas [clusterConfig .Region ].Price
856
- natUnitPrice := aws .NATMetadatas [clusterConfig .Region ].Price
944
+ eksPrice := awslib .EKSPrices [clusterConfig .Region ]
945
+ operatorInstancePrice := awslib .InstanceMetadatas [clusterConfig .Region ]["t3.medium" ].Price
946
+ operatorEBSPrice := awslib .EBSMetadatas [clusterConfig .Region ]["gp3" ].PriceGB * 20 / 30 / 24
947
+ prometheusInstancePrice := awslib .InstanceMetadatas [clusterConfig.Region ][clusterConfig.PrometheusInstanceType ].Price
948
+ prometheusEBSPrice := awslib .EBSMetadatas [clusterConfig .Region ]["gp3" ].PriceGB * 20 / 30 / 24
949
+ metricsEBSPrice := awslib .EBSMetadatas [clusterConfig .Region ]["gp2" ].PriceGB * (40 + 2 ) / 30 / 24
950
+ nlbPrice := awslib .NLBMetadatas [clusterConfig .Region ].Price
951
+ natUnitPrice := awslib .NATMetadatas [clusterConfig .Region ].Price
857
952
858
953
headers := []table.Header {
859
954
{Title : "aws resource" },
@@ -874,13 +969,13 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
874
969
nodesInfo := infoResponse .GetNodesWithNodeGroupName (ngNamePrefix + ng .Name )
875
970
numInstances := len (nodesInfo )
876
971
877
- ebsPrice := aws .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceGB * float64 (ng .InstanceVolumeSize ) / 30 / 24
972
+ ebsPrice := awslib .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceGB * float64 (ng .InstanceVolumeSize ) / 30 / 24
878
973
if ng .InstanceVolumeType == clusterconfig .IO1VolumeType && ng .InstanceVolumeIOPS != nil {
879
- ebsPrice += aws .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceIOPS * float64 (* ng .InstanceVolumeIOPS ) / 30 / 24
974
+ ebsPrice += awslib .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceIOPS * float64 (* ng .InstanceVolumeIOPS ) / 30 / 24
880
975
}
881
976
if ng .InstanceVolumeType == clusterconfig .GP3VolumeType && ng .InstanceVolumeIOPS != nil && ng .InstanceVolumeThroughput != nil {
882
- ebsPrice += libmath .MaxFloat64 (0 , (aws .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceIOPS - 3000 )* float64 (* ng .InstanceVolumeIOPS )/ 30 / 24 )
883
- ebsPrice += libmath .MaxFloat64 (0 , (aws .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceThroughput - 125 )* float64 (* ng .InstanceVolumeThroughput )/ 30 / 24 )
977
+ ebsPrice += libmath .MaxFloat64 (0 , (awslib .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceIOPS - 3000 )* float64 (* ng .InstanceVolumeIOPS )/ 30 / 24 )
978
+ ebsPrice += libmath .MaxFloat64 (0 , (awslib .EBSMetadatas [clusterConfig .Region ][ng .InstanceVolumeType .String ()].PriceThroughput - 125 )* float64 (* ng .InstanceVolumeThroughput )/ 30 / 24 )
884
979
}
885
980
totalEBSPrice := ebsPrice * float64 (numInstances )
886
981
@@ -1040,7 +1135,7 @@ func updateCLIEnv(envName string, operatorEndpoint string, disallowPrompt bool,
1040
1135
return nil
1041
1136
}
1042
1137
1043
- func cmdDebug (awsClient * aws .Client , accessConfig * clusterconfig.AccessConfig ) {
1138
+ func cmdDebug (awsClient * awslib .Client , accessConfig * clusterconfig.AccessConfig ) {
1044
1139
// note: if modifying this string, also change it in files.IgnoreCortexDebug()
1045
1140
debugFileName := fmt .Sprintf ("cortex-debug-%s.tgz" , time .Now ().UTC ().Format ("2006-01-02-15-04-05" ))
1046
1141
@@ -1064,9 +1159,9 @@ func cmdDebug(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig) {
1064
1159
return
1065
1160
}
1066
1161
1067
- func refreshCachedClusterConfig (awsClient * aws .Client , accessConfig * clusterconfig.AccessConfig , printToStdout bool ) clusterconfig.Config {
1162
+ func refreshCachedClusterConfig (awsClient * awslib .Client , accessConfig * clusterconfig.AccessConfig , printToStdout bool ) clusterconfig.Config {
1068
1163
// add empty file if cached cluster doesn't exist so that the file output by manager container maintains current user permissions
1069
- cachedClusterConfigPath := cachedClusterConfigPath (accessConfig .ClusterName , accessConfig .Region )
1164
+ cachedClusterConfigPath := getCachedClusterConfigPath (accessConfig .ClusterName , accessConfig .Region )
1070
1165
containerConfigPath := fmt .Sprintf ("/out/%s" , filepath .Base (cachedClusterConfigPath ))
1071
1166
1072
1167
copyFromPaths := []dockerCopyFromPath {
@@ -1095,7 +1190,7 @@ func refreshCachedClusterConfig(awsClient *aws.Client, accessConfig *clusterconf
1095
1190
return * refreshedClusterConfig
1096
1191
}
1097
1192
1098
- func createS3BucketIfNotFound (awsClient * aws .Client , bucket string , tags map [string ]string ) error {
1193
+ func createS3BucketIfNotFound (awsClient * awslib .Client , bucket string , tags map [string ]string ) error {
1099
1194
bucketFound , err := awsClient .DoesBucketExist (bucket )
1100
1195
if err != nil {
1101
1196
return err
@@ -1123,7 +1218,7 @@ func createS3BucketIfNotFound(awsClient *aws.Client, bucket string, tags map[str
1123
1218
fmt .Println (" ✓" )
1124
1219
return nil
1125
1220
}
1126
- if ! aws .IsNoSuchBucketErr (err ) {
1221
+ if ! awslib .IsNoSuchBucketErr (err ) {
1127
1222
break
1128
1223
}
1129
1224
time .Sleep (1 * time .Second )
@@ -1133,7 +1228,7 @@ func createS3BucketIfNotFound(awsClient *aws.Client, bucket string, tags map[str
1133
1228
return err
1134
1229
}
1135
1230
1136
- func setLifecycleRulesOnClusterUp (awsClient * aws .Client , bucket , newClusterUID string ) error {
1231
+ func setLifecycleRulesOnClusterUp (awsClient * awslib .Client , bucket , newClusterUID string ) error {
1137
1232
err := awsClient .DeleteLifecycleRules (bucket )
1138
1233
if err != nil {
1139
1234
return err
@@ -1177,7 +1272,7 @@ func setLifecycleRulesOnClusterUp(awsClient *aws.Client, bucket, newClusterUID s
1177
1272
return awsClient .SetLifecycleRules (bucket , rules )
1178
1273
}
1179
1274
1180
- func setLifecycleRulesOnClusterDown (awsClient * aws .Client , bucket string ) error {
1275
+ func setLifecycleRulesOnClusterDown (awsClient * awslib .Client , bucket string ) error {
1181
1276
err := awsClient .DeleteLifecycleRules (bucket )
1182
1277
if err != nil {
1183
1278
return err
@@ -1198,7 +1293,7 @@ func setLifecycleRulesOnClusterDown(awsClient *aws.Client, bucket string) error
1198
1293
})
1199
1294
}
1200
1295
1201
- func createLogGroupIfNotFound (awsClient * aws .Client , logGroup string , tags map [string ]string ) error {
1296
+ func createLogGroupIfNotFound (awsClient * awslib .Client , logGroup string , tags map [string ]string ) error {
1202
1297
logGroupFound , err := awsClient .DoesLogGroupExist (logGroup )
1203
1298
if err != nil {
1204
1299
return err
@@ -1240,7 +1335,7 @@ func (lb LoadBalancer) String() string {
1240
1335
}
1241
1336
1242
1337
// Will return error if the load balancer can't be found
1243
- func getLoadBalancer (clusterName string , whichLB LoadBalancer , awsClient * aws .Client ) (* elbv2.LoadBalancer , error ) {
1338
+ func getLoadBalancer (clusterName string , whichLB LoadBalancer , awsClient * awslib .Client ) (* elbv2.LoadBalancer , error ) {
1244
1339
loadBalancer , err := awsClient .FindLoadBalancer (map [string ]string {
1245
1340
clusterconfig .ClusterNameTag : clusterName ,
1246
1341
"cortex.dev/load-balancer" : whichLB .String (),
@@ -1256,7 +1351,7 @@ func getLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *aws.Cl
1256
1351
return loadBalancer , nil
1257
1352
}
1258
1353
1259
- func listPVCVolumesForCluster (awsClient * aws .Client , clusterName string ) ([]ec2.Volume , error ) {
1354
+ func listPVCVolumesForCluster (awsClient * awslib .Client , clusterName string ) ([]ec2.Volume , error ) {
1260
1355
return awsClient .ListVolumes (ec2.Tag {
1261
1356
Key : pointer .String (fmt .Sprintf ("kubernetes.io/cluster/%s" , clusterName )),
1262
1357
Value : nil , // any value should be ok as long as the key is present
@@ -1266,3 +1361,41 @@ func listPVCVolumesForCluster(awsClient *aws.Client, clusterName string) ([]ec2.
1266
1361
// filterEKSCTLOutput splits out on the separator literal, hands the pieces
// together with _eksctlPrefixRegex to s.RemoveDuplicates, and joins whatever
// comes back using the same separator.
// NOTE(review): presumably this collapses repeated eksctl log lines; confirm
// against s.RemoveDuplicates.
func filterEKSCTLOutput (out string ) string {
1267
1362
return strings .Join (s .RemoveDuplicates (strings .Split (out , "\n " ), _eksctlPrefixRegex ), "\n " )
1268
1363
}
1364
+
1365
+ func getClusterRESTConfig (awsClient * awslib.Client , clusterName string ) (* rest.Config , error ) {
1366
+ clusterOutput , err := awsClient .EKS ().DescribeCluster (
1367
+ & eks.DescribeClusterInput {
1368
+ Name : aws .String (clusterName ),
1369
+ },
1370
+ )
1371
+ if err != nil {
1372
+ return nil , err
1373
+ }
1374
+
1375
+ gen , err := token .NewGenerator (true , false )
1376
+ if err != nil {
1377
+ return nil , err
1378
+ }
1379
+
1380
+ opts := & token.GetTokenOptions {
1381
+ ClusterID : aws .StringValue (clusterOutput .Cluster .Name ),
1382
+ }
1383
+
1384
+ tok , err := gen .GetWithOptions (opts )
1385
+ if err != nil {
1386
+ return nil , err
1387
+ }
1388
+
1389
+ ca , err := base64 .StdEncoding .DecodeString (aws .StringValue (clusterOutput .Cluster .CertificateAuthority .Data ))
1390
+ if err != nil {
1391
+ return nil , err
1392
+ }
1393
+
1394
+ return & rest.Config {
1395
+ Host : aws .StringValue (clusterOutput .Cluster .Endpoint ),
1396
+ BearerToken : tok .Token ,
1397
+ TLSClientConfig : rest.TLSClientConfig {
1398
+ CAData : ca ,
1399
+ },
1400
+ }, nil
1401
+ }
0 commit comments